1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
34 #include "memory/tilingtraits.h"
35 #include "core/multisample.h"
39 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, uint32_t, DWORD
[4], const SWR_RECT
& rect
);
40 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
)
50 SWR_CONTEXT
*pContext
= pDC
->pContext
;
52 AR_BEGIN(BEDispatch
, pDC
->drawId
);
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 const API_STATE
& state
= GetApiState(pDC
);
66 SWR_CS_CONTEXT csContext
{ 0 };
67 csContext
.tileCounter
= threadGroupId
;
68 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
69 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
70 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
71 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
72 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
74 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
76 UPDATE_STAT_BE(CsInvocations
, state
.totalThreadsInGroup
);
78 AR_END(BEDispatch
, 1);
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
86 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
91 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
94 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
95 SWR_ASSERT(x
== 0 && y
== 0);
98 template<SWR_FORMAT format
>
99 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
101 auto lambda
= [&](int32_t comp
)
103 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
105 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
108 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
110 for (uint32_t i
= 0; i
< numIter
; ++i
)
112 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief simd16 variant: fill one raster tile with a clear value, one
///        SIMD16 tile at a time.
/// @tparam format - surface format whose FormatTraits drive the stores.
/// @param pTileBuffer - start of the raster tile's storage.
/// @param value - clear value per component, indexed by component number.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    // Stores one component for a whole SIMD16 tile, then moves the write
    // cursor past the bytes that component occupies.
    auto lambda = [&](int32_t comp)
    {
        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);

        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
    };

    // Number of SIMD16 tiles that make up one raster tile.
    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);

    for (uint32_t i = 0; i < numIter; ++i)
    {
        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
    }
}
#endif // USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief Clear the raster tiles of one macrotile's hot tile that fall inside
///        the clear rectangle, then flag the hot tile as HOTTILE_DIRTY.
/// @param pDC - draw context; supplies the sample count and the hot tile manager.
/// @param rt - attachment whose hot tile is cleared.
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices.
/// @param renderTargetArrayIndex - array index forwarded to GetHotTile.
/// @param clear - four clear values; each is loaded through a float pointer.
/// @param rect - clear rectangle (see the "intersect with clear rect" step).
/// NOTE(review): this listing shows no declarations for vClear, vComp or
/// clearTile, no #else/#endif for the USE_8x2_TILE_BACKEND conditional, no
/// statements under the "intersect" / "make maximums inclusive" comments and
/// no block braces - confirm those lines against the full file.
136 template<SWR_FORMAT format
>
137 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, uint32_t renderTargetArrayIndex
, DWORD clear
[4], const SWR_RECT
& rect
)
139 // convert clear color to hottile format
140 // clear color is in RGBA float/uint32
141 #if USE_8x2_TILE_BACKEND
// simd16 path: per component, broadcast the clear value, apply the
// isNormalized() scaling/conversion, pack it and store it in the swizzled
// slot of vClear.
143 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
146 vComp
= _simd16_load1_ps((const float*)&clear
[comp
]);
147 if (FormatTraits
<format
>::isNormalized(comp
))
149 vComp
= _simd16_mul_ps(vComp
, _simd16_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
150 vComp
= _simd16_castsi_ps(_simd16_cvtps_epi32(vComp
));
152 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
153 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
// Same conversion using the regular-width _simd_ intrinsics.
158 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
161 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
162 if (FormatTraits
<format
>::isNormalized(comp
))
164 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
165 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
167 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
168 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
172 uint32_t tileX
, tileY
;
173 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
175 // Init to full macrotile
178 KNOB_MACROTILE_X_DIM
* int32_t(tileX
),
179 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
),
180 KNOB_MACROTILE_X_DIM
* int32_t(tileX
+ 1),
181 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
+ 1),
184 // intersect with clear rect
187 // translate to local hottile origin
188 clearTile
.Translate(-int32_t(tileX
) * KNOB_MACROTILE_X_DIM
, -int32_t(tileY
) * KNOB_MACROTILE_Y_DIM
);
190 // Make maximums inclusive (needed for convert to raster tiles)
194 // convert to raster tiles
195 clearTile
.ymin
>>= (KNOB_TILE_Y_DIM_SHIFT
);
196 clearTile
.ymax
>>= (KNOB_TILE_Y_DIM_SHIFT
);
197 clearTile
.xmin
>>= (KNOB_TILE_X_DIM_SHIFT
);
198 clearTile
.xmax
>>= (KNOB_TILE_X_DIM_SHIFT
);
200 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
201 // compute steps between raster tile samples / raster tiles / macro tile rows
202 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
203 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
204 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
205 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
// Look up the hot tile and compute the byte offset of the first raster tile
// to clear; the 2D tile offset is scaled by the sample count.
207 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
, renderTargetArrayIndex
);
208 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, clearTile
.xmin
, clearTile
.ymin
)) * numSamples
;
209 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
211 // loop over all raster tiles in the current hot tile
212 for (int32_t y
= clearTile
.ymin
; y
<= clearTile
.ymax
; ++y
)
214 uint8_t* pRasterTile
= pRasterTileRow
;
215 for (int32_t x
= clearTile
.xmin
; x
<= clearTile
.xmax
; ++x
)
// Each sample of the raster tile is cleared separately; the cursor advances
// by one sample's worth of tile data per clear.
217 for( int32_t sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
219 ClearRasterTile
<format
>(pRasterTile
, vClear
);
220 pRasterTile
+= rasterTileSampleStep
;
223 pRasterTileRow
+= macroTileRowStep
;
226 pHotTile
->state
= HOTTILE_DIRTY
;
//////////////////////////////////////////////////////////////////////////
/// @brief Backend handler for a clear work item on one macrotile.
/// @param pDC - draw context; supplies the context, draw id and sample count.
/// @param workerId - worker id of the calling thread.
/// @param macroTile - macrotile id passed to GetHotTile / the clear routines.
/// @param pUserData - points at the CLEAR_DESC describing the clear.
/// Two variants are visible. The first copies the clear values into each
/// selected hot tile and sets its state to HOTTILE_CLEAR. The second calls the
/// sClearTilesTable routine for the matching hot tile format immediately.
/// NOTE(review): pClear is declared twice, so the variants presumably live in
/// separate scopes; the condition choosing between them, the clearData
/// declarations, the block braces and any AR_END are not visible in this
/// listing - confirm against the full file.
230 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
232 SWR_CONTEXT
*pContext
= pDC
->pContext
;
236 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
237 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
238 uint32_t numSamples
= GetNumSamples(sampleCount
);
240 SWR_ASSERT(pClear
->attachmentMask
!= 0); // shouldn't be here without a reason.
242 AR_BEGIN(BEClear
, pDC
->drawId
);
244 if (pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
)
246 unsigned long rt
= 0;
247 uint32_t mask
= pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
;
// NOTE(review): no statement removing bit rt from mask is visible inside this
// loop; without one _BitScanForward would keep reporting the same bit - confirm.
248 while (_BitScanForward(&rt
, mask
))
252 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)rt
, true, numSamples
, pClear
->renderTargetArrayIndex
);
254 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
255 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
256 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
257 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
258 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
259 pHotTile
->state
= HOTTILE_CLEAR
;
// Depth: only clearData[0] is written, with the bits of clearDepth.
263 if (pClear
->attachmentMask
& SWR_ATTACHMENT_DEPTH_BIT
)
265 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
, pClear
->renderTargetArrayIndex
);
266 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
267 pHotTile
->state
= HOTTILE_CLEAR
;
// Stencil: clearData[0] receives clearStencil by plain assignment.
270 if (pClear
->attachmentMask
& SWR_ATTACHMENT_STENCIL_BIT
)
272 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
, pClear
->renderTargetArrayIndex
);
274 pHotTile
->clearData
[0] = pClear
->clearStencil
;
275 pHotTile
->state
= HOTTILE_CLEAR
;
// Second variant: clear the tiles immediately through sClearTilesTable.
283 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
284 AR_BEGIN(BEClear
, pDC
->drawId
);
286 if (pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
)
289 clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
290 clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
291 clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
292 clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
294 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
295 SWR_ASSERT(pfnClearTiles
!= nullptr);
297 unsigned long rt
= 0;
298 uint32_t mask
= pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
;
// NOTE(review): as above, no statement clearing bit rt from mask is visible
// in this loop - confirm.
299 while (_BitScanForward(&rt
, mask
))
303 pfnClearTiles(pDC
, (SWR_RENDERTARGET_ATTACHMENT
)rt
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
307 if (pClear
->attachmentMask
& SWR_ATTACHMENT_DEPTH_BIT
)
310 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
311 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
312 SWR_ASSERT(pfnClearTiles
!= nullptr);
314 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
// NOTE(review): unlike the color and depth paths, no SWR_ASSERT on
// pfnClearTiles is visible for stencil.
317 if (pClear
->attachmentMask
& SWR_ATTACHMENT_STENCIL_BIT
)
320 clearData
[0] = pClear
->clearStencil
;
321 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
323 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
330 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, STORE_TILES_DESC
* pDesc
,
331 SWR_RENDERTARGET_ATTACHMENT attachment
)
333 SWR_CONTEXT
*pContext
= pDC
->pContext
;
335 AR_BEGIN(BEStoreTiles
, pDC
->drawId
);
337 SWR_FORMAT srcFormat
;
340 case SWR_ATTACHMENT_COLOR0
:
341 case SWR_ATTACHMENT_COLOR1
:
342 case SWR_ATTACHMENT_COLOR2
:
343 case SWR_ATTACHMENT_COLOR3
:
344 case SWR_ATTACHMENT_COLOR4
:
345 case SWR_ATTACHMENT_COLOR5
:
346 case SWR_ATTACHMENT_COLOR6
:
347 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
348 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
349 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
350 default: SWR_INVALID("Unknown attachment: %d", attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
354 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
356 // Only need to store the hottile if it's been rendered to...
357 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(pContext
, pDC
, macroTile
, attachment
, false);
360 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
361 if (pHotTile
->state
== HOTTILE_CLEAR
)
363 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
364 SWR_ASSERT(pfnClearTiles
!= nullptr);
366 pfnClearTiles(pDC
, attachment
, macroTile
, pHotTile
->renderTargetArrayIndex
, pHotTile
->clearData
, pDesc
->rect
);
369 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
371 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
372 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
374 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
375 attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
379 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
381 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
384 AR_END(BEStoreTiles
, 1);
387 void ProcessStoreTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
389 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
391 unsigned long rt
= 0;
392 uint32_t mask
= pDesc
->attachmentMask
;
393 while (_BitScanForward(&rt
, mask
))
396 ProcessStoreTileBE(pDC
, workerId
, macroTile
, pDesc
, (SWR_RENDERTARGET_ATTACHMENT
)rt
);
400 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
402 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
403 SWR_CONTEXT
*pContext
= pDC
->pContext
;
405 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
407 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
409 if (pDesc
->attachmentMask
& (1 << i
))
411 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
412 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
415 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
421 #if KNOB_SIMD_WIDTH == 8
422 const simdscalar vCenterOffsetsX
= __m256
{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
423 const simdscalar vCenterOffsetsY
= __m256
{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
424 const simdscalar vULOffsetsX
= __m256
{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
425 const simdscalar vULOffsetsY
= __m256
{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
427 #error Unsupported vector width
430 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
432 simdscalar vClipMask
= _simd_setzero_ps();
433 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
435 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
437 // pull triangle clip distance values from clip buffer
438 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
439 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
440 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
443 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
445 // clip if interpolated clip distance is < 0 || NAN
446 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
448 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
451 return _simd_movemask_ps(vClipMask
);
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample pixel backend for the raster tile starting at (x, y).
///        Walks the tile in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps and, per
///        step, runs depth/stencil testing, the pixel shader and the output
///        merger.
/// @param pDC - draw context; supplies the API state and draw id.
/// @param workerId - worker id of the calling thread.
/// @param x - starting x of the tile; converted to float for psContext.vX.
/// @param y - starting y of the tile; converted to float for psContext.vY.
/// @param work - triangle descriptor; its coverage masks are shifted right as
///        each SIMD tile is consumed.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the color,
///        depth and stencil pointers.
/// NOTE(review): T is used as a traits type but its template header is not
/// visible here; block braces, #else/#endif lines and the conditions that gate
/// the early/late depth test sections are also missing from this listing -
/// confirm against the full file.
455 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
457 SWR_CONTEXT
*pContext
= pDC
->pContext
;
459 AR_BEGIN(BESingleSampleBackend
, pDC
->drawId
);
460 AR_BEGIN(BESetup
, pDC
->drawId
);
462 const API_STATE
&state
= GetApiState(pDC
);
464 BarycentricCoeffs coeffs
;
465 SetupBarycentricCoeffs(&coeffs
, work
);
467 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
468 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
470 SWR_PS_CONTEXT psContext
;
471 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
472 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// Per-lane Y positions (upper-left and center) for the first SIMD tile row.
476 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
477 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
479 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
481 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
483 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
484 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
486 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
488 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
490 #if USE_8x2_TILE_BACKEND
491 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
// Coverage bits for the current SIMD tile (MASK is defined outside this view).
494 simdmask coverageMask
= work
.coverageMask
[0] & MASK
;
// Depth bounds test: drop lanes rejected by CalcDepthBoundsAcceptMask for the
// depth values currently in the buffer.
498 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
500 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
502 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer
));
504 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
505 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
507 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
510 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
512 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
514 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
517 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
519 CalcPixelBarycentrics(coeffs
, psContext
);
521 CalcCentroid
<T
, true>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
523 // interpolate and quantize z
524 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
525 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
527 AR_END(BEBarycentric
, 1);
529 // interpolate user clip distance if available
530 if (state
.rastState
.clipDistanceMask
)
532 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.center
, psContext
.vJ
.center
);
535 simdscalar vCoverageMask
= vMask(coverageMask
);
536 simdscalar depthPassMask
= vCoverageMask
;
537 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test ahead of the pixel shader.
// NOTE(review): a second test after the shader is also visible below; the
// condition selecting between the two is not shown in this listing.
542 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
543 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
544 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
545 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
546 AR_END(BEEarlyDepthTest
, 0);
548 // early-exit if no pixels passed depth or earlyZ is forced on
549 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
551 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
552 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
554 if (!_simd_movemask_ps(depthPassMask
))
561 psContext
.sampleIndex
= 0;
562 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
564 // execute pixel shader
565 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
566 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
567 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
568 AR_END(BEPixelShader
, 0);
// The shader may have changed activeMask; use it as the coverage from here on.
570 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
575 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
576 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
577 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
578 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
579 AR_END(BELateDepthTest
, 0);
581 if (!_simd_movemask_ps(depthPassMask
))
583 // need to call depth/stencil write for stencil write
584 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
585 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
590 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
591 uint32_t statCount
= _mm_popcnt_u32(statMask
);
592 UPDATE_STAT_BE(DepthPassCount
, statCount
);
595 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
596 #if USE_8x2_TILE_BACKEND
597 OutputMerger8x2(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, state
.colorHottileEnable
, useAlternateOffset
);
599 OutputMerger4x2(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
602 // do final depth write after all pixel kills
603 if (!state
.psState
.forceEarlyZ
)
605 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
606 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
608 AR_END(BEOutputMerger
, 0);
// End of SIMD tile: consume its coverage bits and advance the buffer pointers.
612 AR_BEGIN(BEEndTile
, pDC
->drawId
);
614 work
.coverageMask
[0] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
615 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
617 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
620 #if USE_8x2_TILE_BACKEND
621 if (useAlternateOffset
)
623 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
625 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
629 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
631 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
634 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
635 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
637 AR_END(BEEndTile
, 0);
// Step the per-lane X positions to the next SIMD tile.
639 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
640 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Step the per-lane Y positions to the next SIMD tile row.
643 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
644 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
647 AR_END(BESingleSampleBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Sample-rate pixel backend for the raster tile starting at (x, y).
///        Walks the tile in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps and, for
///        every sample index below T::MultisampleT::numSamples, runs
///        depth/stencil testing, the pixel shader and the output merger.
/// @param pDC - draw context; supplies the API state and draw id.
/// @param workerId - worker id of the calling thread.
/// @param x - starting x of the tile; converted to float for psContext.vX.
/// @param y - starting y of the tile; converted to float for psContext.vY.
/// @param work - triangle descriptor; each sample's coverage mask is shifted
///        right as its SIMD tile is consumed.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the color,
///        depth and stencil pointers.
/// NOTE(review): T is used as a traits type but its template header is not
/// visible here; block braces, #else/#endif lines and the conditions that gate
/// the early/late depth test sections are also missing from this listing -
/// confirm against the full file.
651 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
653 SWR_CONTEXT
*pContext
= pDC
->pContext
;
655 AR_BEGIN(BESampleRateBackend
, pDC
->drawId
);
656 AR_BEGIN(BESetup
, pDC
->drawId
);
658 const API_STATE
&state
= GetApiState(pDC
);
660 BarycentricCoeffs coeffs
;
661 SetupBarycentricCoeffs(&coeffs
, work
);
663 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
664 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
666 SWR_PS_CONTEXT psContext
;
667 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
668 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// Per-lane Y positions (upper-left and center) for the first SIMD tile row.
672 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
673 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
675 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
677 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
679 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
680 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
682 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
684 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
686 #if USE_8x2_TILE_BACKEND
687 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
690 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
692 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
694 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
697 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
699 CalcPixelBarycentrics(coeffs
, psContext
);
701 CalcCentroid
<T
, false>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
703 AR_END(BEBarycentric
, 0);
// Everything below runs once per sample of the current SIMD tile.
705 for (uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
707 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
711 // offset depth/stencil buffers current sample
712 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
713 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Depth bounds test: drop lanes rejected by CalcDepthBoundsAcceptMask for the
// depth values currently stored for this sample.
715 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
717 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
719 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
721 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
722 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
724 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
727 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
729 // calculate per sample positions
730 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
731 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
733 CalcSampleBarycentrics(coeffs
, psContext
);
735 // interpolate and quantize z
736 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
737 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
739 AR_END(BEBarycentric
, 0);
741 // interpolate user clip distance if available
742 if (state
.rastState
.clipDistanceMask
)
744 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
747 simdscalar vCoverageMask
= vMask(coverageMask
);
748 simdscalar depthPassMask
= vCoverageMask
;
749 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test ahead of the pixel shader.
// NOTE(review): a second test after the shader is also visible below; the
// condition selecting between the two is not shown in this listing.
754 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
755 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
756 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
757 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
758 AR_END(BEEarlyDepthTest
, 0);
760 // early-exit if no samples passed depth or earlyZ is forced on.
761 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
763 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
764 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
766 if (!_simd_movemask_ps(depthPassMask
))
768 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
774 psContext
.sampleIndex
= sample
;
775 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
777 // execute pixel shader
778 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
779 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
780 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
781 AR_END(BEPixelShader
, 0);
// The shader may have changed activeMask; use it as the coverage from here on.
783 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
788 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
789 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
790 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
791 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
792 AR_END(BELateDepthTest
, 0);
794 if (!_simd_movemask_ps(depthPassMask
))
796 // need to call depth/stencil write for stencil write
797 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
798 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
800 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
805 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
806 uint32_t statCount
= _mm_popcnt_u32(statMask
);
807 UPDATE_STAT_BE(DepthPassCount
, statCount
);
810 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
811 #if USE_8x2_TILE_BACKEND
812 OutputMerger8x2(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, state
.colorHottileEnable
, useAlternateOffset
);
814 OutputMerger4x2(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
817 // do final depth write after all pixel kills
818 if (!state
.psState
.forceEarlyZ
)
820 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
821 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
823 AR_END(BEOutputMerger
, 0);
// Consume this sample's coverage bits for the current SIMD tile.
825 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// End of SIMD tile: advance the inner coverage mask and the buffer pointers.
831 AR_BEGIN(BEEndTile
, pDC
->drawId
);
833 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
835 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
838 #if USE_8x2_TILE_BACKEND
839 if (useAlternateOffset
)
841 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
843 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
847 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
849 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
852 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
853 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
855 AR_END(BEEndTile
, 0);
// Step the per-lane X positions to the next SIMD tile.
857 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
858 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Step the per-lane Y positions to the next SIMD tile row.
861 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
862 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
865 AR_END(BESampleRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Backend variant used when no pixel shader is bound. For every SIMD
///        tile and every sample enabled in state.blendState.sampleMask it
///        interpolates and quantizes Z, runs DepthStencilTest and then
///        DepthStencilWrite. SetupRenderBuffers is called with NULL as its
///        first argument, so only depth and stencil pointers are obtained.
/// @tparam sampleCountT - forwarded to SwrBackendTraits<sampleCountT, false>.
/// @param pDC - draw context; supplies pContext, drawId and the API state.
/// @param workerId - not referenced by name in the statements below
///        (NOTE(review): possibly consumed by the AR_* / UPDATE_STAT_BE macros - confirm).
/// @param x - first X coordinate walked; the inner loop covers [x, x + KNOB_TILE_X_DIM).
/// @param y - first Y coordinate walked; the outer loop covers [y, y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor; its per-sample coverageMask entries are
///        read and shifted right as SIMD tiles are consumed.
/// @param renderBuffers - handed to SetupRenderBuffers to obtain the depth and stencil pointers.
867 // optimized backend flow with NULL PS
868 template<uint32_t sampleCountT
>
869 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
// pContext is not referenced by name below.
// NOTE(review): presumably required by the AR_* / UPDATE_STAT_BE macros - confirm.
871 SWR_CONTEXT
*pContext
= pDC
->pContext
;
873 AR_BEGIN(BENullBackend
, pDC
->drawId
);
874 ///@todo: handle center multisample pattern
875 typedef SwrBackendTraits
<sampleCountT
, false> T
;
876 AR_BEGIN(BESetup
, pDC
->drawId
);
878 const API_STATE
&state
= GetApiState(pDC
);
880 BarycentricCoeffs coeffs
;
881 SetupBarycentricCoeffs(&coeffs
, work
);
// Only depth and stencil pointers are requested; NULL is passed for the first argument.
883 uint8_t *pDepthBuffer
, *pStencilBuffer
;
884 SetupRenderBuffers(NULL
, &pDepthBuffer
, &pStencilBuffer
, 0, renderBuffers
);
886 SWR_PS_CONTEXT psContext
;
887 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
// NOTE(review): no AR_END(BESetup, ...) is visible between the AR_BEGIN(BESetup, ...)
// above and the tile loops - confirm the BESetup bucket is closed.
// Y positions for the first SIMD-tile row: vULOffsetsY plus the starting y.
891 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
// dy moves the Y positions down by one SIMD tile per outer-loop iteration.
893 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
894 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
// Outer loop: one iteration per row of SIMD tiles.
895 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X positions restart from the starting x at the beginning of every row.
897 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
899 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
// Inner loop: one iteration per SIMD tile within the row.
901 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
903 // iterate over active samples
904 unsigned long sample
= 0;
905 uint32_t sampleMask
= state
.blendState
.sampleMask
;
// _BitScanForward reports the index of the lowest set bit; that bit is cleared
// right below, so each enabled sample is visited exactly once.
906 while (_BitScanForward(&sample
, sampleMask
))
908 sampleMask
&= ~(1 << sample
);
// Coverage bits of this sample for the current SIMD tile (MASK is defined outside this block).
910 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
914 // offset depth/stencil buffers current sample
915 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
916 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Optional depth-bounds test: loads the depth values currently stored for this
// sample and keeps only the pixels accepted by CalcDepthBoundsAcceptMask.
918 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
// The float load below relies on the depth hot tile format being R32_FLOAT.
920 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
922 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
924 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
925 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
927 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
930 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
932 // calculate per sample positions
933 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, samplePos
.vX(sample
));
934 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, samplePos
.vY(sample
));
936 CalcSampleBarycentrics(coeffs
, psContext
);
938 // interpolate and quantize z
939 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
940 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
942 AR_END(BEBarycentric
, 0);
944 // interpolate user clip distance if available
945 if (state
.rastState
.clipDistanceMask
)
// Pixels reported by ComputeUserClipMask are removed from the coverage.
947 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// stencilPassMask starts out equal to the coverage mask and is passed by
// pointer to DepthStencilTest, which may update it.
950 simdscalar vCoverageMask
= vMask(coverageMask
);
951 simdscalar stencilPassMask
= vCoverageMask
;
953 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
954 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
955 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
956 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
// The write is issued directly after the test, inside the same BEEarlyDepthTest bucket.
957 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
958 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
959 AR_END(BEEarlyDepthTest
, 0);
// Number of pixels that passed the depth test feeds the DepthPassCount statistic.
961 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
962 uint32_t statCount
= _mm_popcnt_u32(statMask
);
963 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Drop the bits of the SIMD tile just processed so the following tile's bits
// occupy the low positions for the next "& MASK".
968 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance both pointers by KNOB_SIMD_WIDTH pixels, converting bits-per-pixel to bytes.
971 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
972 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
// Step the X positions to the next SIMD tile in the row.
974 vXSamplePosUL
= _simd_add_ps(vXSamplePosUL
, dx
);
// Step the Y positions to the next row of SIMD tiles.
977 vYSamplePosUL
= _simd_add_ps(vYSamplePosUL
, dy
);
980 AR_END(BENullBackend
, 0);
983 void InitClearTilesTable()
985 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
987 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
988 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
989 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
990 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
991 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
// Backend function lookup tables.
// gBackendNullPs holds one BackendNullPS instantiation per multisample count;
// it is filled directly in InitBackendFuncTables().
994 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
// Single-sample backends. This table is passed to InitBackendSingleFuncTable(),
// whose parameter type is [SWR_INPUT_COVERAGE_COUNT][2][2].
995 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
]
// Pixel-rate MSAA backends.
// NOTE(review): no function in this part of the file writes this table;
// presumably InitBackendPixelRate0() populates it - confirm.
999 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1000 [2] // isCenterPattern
1001 [SWR_INPUT_COVERAGE_COUNT
]
1003 [2] // forcedSampleCount
// Sample-rate MSAA backends. This table is passed to InitBackendSampleFuncTable(),
// whose parameter type is [SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2].
1006 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1007 [SWR_INPUT_COVERAGE_COUNT
]
1012 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1013 // arguments to static template arguments.
1014 template <uint32_t... ArgsT
>
1017 // Last Arg Terminator
1018 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1022 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1023 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1024 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1026 SWR_ASSERT(0 && "Invalid backend func\n");
1032 // Recursively parse args
1033 template <typename
... TArgsT
>
1034 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
1038 case SWR_INPUT_COVERAGE_NONE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
1039 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
1040 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
1042 SWR_ASSERT(0 && "Invalid sample pattern\n");
1043 return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
1048 // Recursively parse args
1049 template <typename
... TArgsT
>
1050 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1054 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1055 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1056 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1057 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1058 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1060 SWR_ASSERT(0 && "Invalid sample count\n");
1061 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1066 // Recursively parse args
1067 template <typename
... TArgsT
>
1068 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1072 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1075 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1079 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
1081 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1083 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1085 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1087 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1088 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
1089 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
1095 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
1097 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1099 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1101 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1103 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1105 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1106 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
1107 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1114 void InitBackendPixelRate0();
1115 void InitBackendFuncTables()
1117 InitBackendSingleFuncTable(gBackendSingleSample
);
1118 InitBackendPixelRate0();
1119 InitBackendSampleFuncTable(gBackendSampleRateTable
);
1121 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1122 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1123 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1124 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1125 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;