/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Backend handles rasterization, pixel shading and output merger
*        operations.
*
******************************************************************************/
30 #include <smmintrin.h>
34 #include "memory/tilingtraits.h"
35 #include "core/multisample.h"
39 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, uint32_t, DWORD
[4], const SWR_RECT
& rect
);
40 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
, void*& pScratchSpace
)
50 SWR_CONTEXT
*pContext
= pDC
->pContext
;
52 AR_BEGIN(BEDispatch
, pDC
->drawId
);
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 size_t scratchSpaceSize
= pDC
->pState
->state
.scratchSpaceSize
* pDC
->pState
->state
.scratchSpaceNumInstances
;
65 if (scratchSpaceSize
&& pScratchSpace
== nullptr)
67 pScratchSpace
= pDC
->pArena
->AllocAlignedSync(scratchSpaceSize
, KNOB_SIMD_BYTES
);
70 const API_STATE
& state
= GetApiState(pDC
);
72 SWR_CS_CONTEXT csContext
{ 0 };
73 csContext
.tileCounter
= threadGroupId
;
74 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
75 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
76 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
77 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
78 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
79 csContext
.pScratchSpace
= (uint8_t*)pScratchSpace
;
80 csContext
.scratchSpacePerSimd
= pDC
->pState
->state
.scratchSpaceSize
;
82 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
84 UPDATE_STAT_BE(CsInvocations
, state
.totalThreadsInGroup
);
86 AR_END(BEDispatch
, 1);
89 //////////////////////////////////////////////////////////////////////////
90 /// @brief Process shutdown.
91 /// @param pDC - pointer to draw context (dispatch).
92 /// @param workerId - The unique worker ID that is assigned to this thread.
93 /// @param threadGroupId - the linear index for the thread group within the dispatch.
94 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
99 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
102 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
103 SWR_ASSERT(x
== 0 && y
== 0);
106 template<SWR_FORMAT format
>
107 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
109 auto lambda
= [&](int32_t comp
)
111 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
113 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
116 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
118 for (uint32_t i
= 0; i
< numIter
; ++i
)
120 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 variant of ClearRasterTile, used by the 8x2 tile backend.
/// @tparam format - format of the tile being cleared.
/// @param pTileBuffer - start of the raster tile to overwrite.
/// @param value - per-component clear value; value.v[comp] is stored for
///        every SIMD16 tile in the raster tile.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    // Store one component for one SIMD16 tile, then step past the bytes just
    // written (SIMD16 width * bits-per-component / 8).
    auto lambda = [&](int32_t comp)
    {
        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);

        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
    };

    // Number of SIMD16 tiles that make up one raster tile.
    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);

    for (uint32_t i = 0; i < numIter; ++i)
    {
        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
    }
}

#endif
144 template<SWR_FORMAT format
>
145 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, uint32_t renderTargetArrayIndex
, DWORD clear
[4], const SWR_RECT
& rect
)
147 // convert clear color to hottile format
148 // clear color is in RGBA float/uint32
149 #if USE_8x2_TILE_BACKEND
151 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
154 vComp
= _simd16_load1_ps((const float*)&clear
[comp
]);
155 if (FormatTraits
<format
>::isNormalized(comp
))
157 vComp
= _simd16_mul_ps(vComp
, _simd16_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
158 vComp
= _simd16_castsi_ps(_simd16_cvtps_epi32(vComp
));
160 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
161 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
166 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
169 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
170 if (FormatTraits
<format
>::isNormalized(comp
))
172 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
173 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
175 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
176 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
180 uint32_t tileX
, tileY
;
181 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
183 // Init to full macrotile
186 KNOB_MACROTILE_X_DIM
* int32_t(tileX
),
187 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
),
188 KNOB_MACROTILE_X_DIM
* int32_t(tileX
+ 1),
189 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
+ 1),
192 // intersect with clear rect
195 // translate to local hottile origin
196 clearTile
.Translate(-int32_t(tileX
) * KNOB_MACROTILE_X_DIM
, -int32_t(tileY
) * KNOB_MACROTILE_Y_DIM
);
198 // Make maximums inclusive (needed for convert to raster tiles)
202 // convert to raster tiles
203 clearTile
.ymin
>>= (KNOB_TILE_Y_DIM_SHIFT
);
204 clearTile
.ymax
>>= (KNOB_TILE_Y_DIM_SHIFT
);
205 clearTile
.xmin
>>= (KNOB_TILE_X_DIM_SHIFT
);
206 clearTile
.xmax
>>= (KNOB_TILE_X_DIM_SHIFT
);
208 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
209 // compute steps between raster tile samples / raster tiles / macro tile rows
210 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
211 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
212 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
213 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
215 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
, renderTargetArrayIndex
);
216 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, clearTile
.xmin
, clearTile
.ymin
)) * numSamples
;
217 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
219 // loop over all raster tiles in the current hot tile
220 for (int32_t y
= clearTile
.ymin
; y
<= clearTile
.ymax
; ++y
)
222 uint8_t* pRasterTile
= pRasterTileRow
;
223 for (int32_t x
= clearTile
.xmin
; x
<= clearTile
.xmax
; ++x
)
225 for( int32_t sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
227 ClearRasterTile
<format
>(pRasterTile
, vClear
);
228 pRasterTile
+= rasterTileSampleStep
;
231 pRasterTileRow
+= macroTileRowStep
;
234 pHotTile
->state
= HOTTILE_DIRTY
;
238 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
240 SWR_CONTEXT
*pContext
= pDC
->pContext
;
244 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
245 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
246 uint32_t numSamples
= GetNumSamples(sampleCount
);
248 SWR_ASSERT(pClear
->attachmentMask
!= 0); // shouldn't be here without a reason.
250 AR_BEGIN(BEClear
, pDC
->drawId
);
252 if (pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
)
254 unsigned long rt
= 0;
255 uint32_t mask
= pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
;
256 while (_BitScanForward(&rt
, mask
))
260 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)rt
, true, numSamples
, pClear
->renderTargetArrayIndex
);
262 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
263 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
264 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
265 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
266 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
267 pHotTile
->state
= HOTTILE_CLEAR
;
271 if (pClear
->attachmentMask
& SWR_ATTACHMENT_DEPTH_BIT
)
273 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
, pClear
->renderTargetArrayIndex
);
274 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
275 pHotTile
->state
= HOTTILE_CLEAR
;
278 if (pClear
->attachmentMask
& SWR_ATTACHMENT_STENCIL_BIT
)
280 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
, pClear
->renderTargetArrayIndex
);
282 pHotTile
->clearData
[0] = pClear
->clearStencil
;
283 pHotTile
->state
= HOTTILE_CLEAR
;
291 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
292 AR_BEGIN(BEClear
, pDC
->drawId
);
294 if (pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
)
297 clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
298 clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
299 clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
300 clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
302 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
303 SWR_ASSERT(pfnClearTiles
!= nullptr);
305 unsigned long rt
= 0;
306 uint32_t mask
= pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
;
307 while (_BitScanForward(&rt
, mask
))
311 pfnClearTiles(pDC
, (SWR_RENDERTARGET_ATTACHMENT
)rt
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
315 if (pClear
->attachmentMask
& SWR_ATTACHMENT_DEPTH_BIT
)
318 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
319 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
320 SWR_ASSERT(pfnClearTiles
!= nullptr);
322 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
325 if (pClear
->attachmentMask
& SWR_ATTACHMENT_STENCIL_BIT
)
328 clearData
[0] = pClear
->clearStencil
;
329 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
331 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
338 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, STORE_TILES_DESC
* pDesc
,
339 SWR_RENDERTARGET_ATTACHMENT attachment
)
341 SWR_CONTEXT
*pContext
= pDC
->pContext
;
343 AR_BEGIN(BEStoreTiles
, pDC
->drawId
);
345 SWR_FORMAT srcFormat
;
348 case SWR_ATTACHMENT_COLOR0
:
349 case SWR_ATTACHMENT_COLOR1
:
350 case SWR_ATTACHMENT_COLOR2
:
351 case SWR_ATTACHMENT_COLOR3
:
352 case SWR_ATTACHMENT_COLOR4
:
353 case SWR_ATTACHMENT_COLOR5
:
354 case SWR_ATTACHMENT_COLOR6
:
355 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
356 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
357 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
358 default: SWR_INVALID("Unknown attachment: %d", attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
362 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
364 // Only need to store the hottile if it's been rendered to...
365 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(pContext
, pDC
, macroTile
, attachment
, false);
368 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
369 if (pHotTile
->state
== HOTTILE_CLEAR
)
371 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
372 SWR_ASSERT(pfnClearTiles
!= nullptr);
374 pfnClearTiles(pDC
, attachment
, macroTile
, pHotTile
->renderTargetArrayIndex
, pHotTile
->clearData
, pDesc
->rect
);
377 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
379 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
380 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
382 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
383 attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
387 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
389 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
392 AR_END(BEStoreTiles
, 1);
395 void ProcessStoreTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
397 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
399 unsigned long rt
= 0;
400 uint32_t mask
= pDesc
->attachmentMask
;
401 while (_BitScanForward(&rt
, mask
))
404 ProcessStoreTileBE(pDC
, workerId
, macroTile
, pDesc
, (SWR_RENDERTARGET_ATTACHMENT
)rt
);
408 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
410 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
411 SWR_CONTEXT
*pContext
= pDC
->pContext
;
413 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
415 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
417 if (pDesc
->attachmentMask
& (1 << i
))
419 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
420 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
423 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
429 #if KNOB_SIMD_WIDTH == 8
430 const simdscalar vCenterOffsetsX
= __m256
{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
431 const simdscalar vCenterOffsetsY
= __m256
{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
432 const simdscalar vULOffsetsX
= __m256
{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
433 const simdscalar vULOffsetsY
= __m256
{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
435 #error Unsupported vector width
438 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
440 simdscalar vClipMask
= _simd_setzero_ps();
441 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
443 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
445 // pull triangle clip distance values from clip buffer
446 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
447 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
448 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
451 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
453 // clip if interpolated clip distance is < 0 || NAN
454 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
456 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
459 return _simd_movemask_ps(vClipMask
);
463 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
465 SWR_CONTEXT
*pContext
= pDC
->pContext
;
467 AR_BEGIN(BESingleSampleBackend
, pDC
->drawId
);
468 AR_BEGIN(BESetup
, pDC
->drawId
);
470 const API_STATE
&state
= GetApiState(pDC
);
472 BarycentricCoeffs coeffs
;
473 SetupBarycentricCoeffs(&coeffs
, work
);
475 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
476 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
478 SWR_PS_CONTEXT psContext
;
479 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
480 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
484 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
485 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
487 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
489 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
491 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
492 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
494 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
496 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
498 #if USE_8x2_TILE_BACKEND
499 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
502 simdmask coverageMask
= work
.coverageMask
[0] & MASK
;
506 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
508 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
510 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer
));
512 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
513 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
515 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
518 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
520 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
522 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
525 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
527 CalcPixelBarycentrics(coeffs
, psContext
);
529 CalcCentroid
<T
, true>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
531 // interpolate and quantize z
532 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
533 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
535 AR_END(BEBarycentric
, 1);
537 // interpolate user clip distance if available
538 if (state
.rastState
.clipDistanceMask
)
540 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.center
, psContext
.vJ
.center
);
543 simdscalar vCoverageMask
= vMask(coverageMask
);
544 simdscalar depthPassMask
= vCoverageMask
;
545 simdscalar stencilPassMask
= vCoverageMask
;
550 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
551 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
552 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
553 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
554 AR_END(BEEarlyDepthTest
, 0);
556 // early-exit if no pixels passed depth or earlyZ is forced on
557 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
559 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
560 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
562 if (!_simd_movemask_ps(depthPassMask
))
569 psContext
.sampleIndex
= 0;
570 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
572 // execute pixel shader
573 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
574 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
575 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
576 AR_END(BEPixelShader
, 0);
578 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
583 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
584 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
585 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
586 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
587 AR_END(BELateDepthTest
, 0);
589 if (!_simd_movemask_ps(depthPassMask
))
591 // need to call depth/stencil write for stencil write
592 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
593 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
598 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
599 uint32_t statCount
= _mm_popcnt_u32(statMask
);
600 UPDATE_STAT_BE(DepthPassCount
, statCount
);
603 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
604 #if USE_8x2_TILE_BACKEND
605 OutputMerger8x2(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, state
.colorHottileEnable
, useAlternateOffset
);
607 OutputMerger4x2(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
610 // do final depth write after all pixel kills
611 if (!state
.psState
.forceEarlyZ
)
613 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
614 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
616 AR_END(BEOutputMerger
, 0);
620 AR_BEGIN(BEEndTile
, pDC
->drawId
);
622 work
.coverageMask
[0] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
623 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
625 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
628 #if USE_8x2_TILE_BACKEND
629 if (useAlternateOffset
)
631 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
633 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
637 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
639 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
642 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
643 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
645 AR_END(BEEndTile
, 0);
647 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
648 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
651 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
652 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
655 AR_END(BESingleSampleBackend
, 0);
659 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
661 SWR_CONTEXT
*pContext
= pDC
->pContext
;
663 AR_BEGIN(BESampleRateBackend
, pDC
->drawId
);
664 AR_BEGIN(BESetup
, pDC
->drawId
);
666 const API_STATE
&state
= GetApiState(pDC
);
668 BarycentricCoeffs coeffs
;
669 SetupBarycentricCoeffs(&coeffs
, work
);
671 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
672 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
674 SWR_PS_CONTEXT psContext
;
675 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
676 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
680 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
681 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
683 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
685 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
687 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
688 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
690 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
692 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
694 #if USE_8x2_TILE_BACKEND
695 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
698 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
700 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
702 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
705 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
707 CalcPixelBarycentrics(coeffs
, psContext
);
709 CalcCentroid
<T
, false>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
711 AR_END(BEBarycentric
, 0);
713 for (uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
715 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
719 // offset depth/stencil buffers current sample
720 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
721 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
723 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
725 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
727 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
729 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
730 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
732 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
735 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
737 // calculate per sample positions
738 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
739 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
741 CalcSampleBarycentrics(coeffs
, psContext
);
743 // interpolate and quantize z
744 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
745 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
747 AR_END(BEBarycentric
, 0);
749 // interpolate user clip distance if available
750 if (state
.rastState
.clipDistanceMask
)
752 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
755 simdscalar vCoverageMask
= vMask(coverageMask
);
756 simdscalar depthPassMask
= vCoverageMask
;
757 simdscalar stencilPassMask
= vCoverageMask
;
762 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
763 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
764 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
765 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
766 AR_END(BEEarlyDepthTest
, 0);
768 // early-exit if no samples passed depth or earlyZ is forced on.
769 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
771 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
772 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
774 if (!_simd_movemask_ps(depthPassMask
))
776 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
782 psContext
.sampleIndex
= sample
;
783 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
785 // execute pixel shader
786 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
787 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
788 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
789 AR_END(BEPixelShader
, 0);
791 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
796 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
797 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
798 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
799 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
800 AR_END(BELateDepthTest
, 0);
802 if (!_simd_movemask_ps(depthPassMask
))
804 // need to call depth/stencil write for stencil write
805 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
806 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
808 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
813 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
814 uint32_t statCount
= _mm_popcnt_u32(statMask
);
815 UPDATE_STAT_BE(DepthPassCount
, statCount
);
818 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
819 #if USE_8x2_TILE_BACKEND
820 OutputMerger8x2(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, state
.colorHottileEnable
, useAlternateOffset
);
822 OutputMerger4x2(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
825 // do final depth write after all pixel kills
826 if (!state
.psState
.forceEarlyZ
)
828 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
829 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
831 AR_END(BEOutputMerger
, 0);
833 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
839 AR_BEGIN(BEEndTile
, pDC
->drawId
);
841 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
843 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
846 #if USE_8x2_TILE_BACKEND
847 if (useAlternateOffset
)
849 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
851 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
855 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
857 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
860 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
861 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
863 AR_END(BEEndTile
, 0);
865 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
866 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
869 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
870 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
873 AR_END(BESampleRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Depth/stencil-only backend. Walks the tile starting at (x, y) in
///        SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps and, for every bit set in
///        state.blendState.sampleMask, interpolates z, runs DepthStencilTest
///        and then DepthStencilWrite. No pixel shader is invoked here, and
///        SetupRenderBuffers is called with NULL as its first argument.
/// @tparam sampleCountT - sample count this instantiation is registered for;
///         not referenced by name in the statements below.
/// @param pDC - pointer to draw context.
/// @param workerId - worker id; not referenced by name in the statements below
///        (NOTE(review): presumably consumed by the AR_* / UPDATE_STAT_BE macros - confirm).
/// @param x - x coordinate at which the tile walk starts.
/// @param y - y coordinate at which the tile walk starts.
/// @param work - triangle descriptor; its per-sample coverage masks are
///        shifted (consumed) as the walk advances.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the depth
///        and stencil pointers.
875 // optimized backend flow with NULL PS
876 template<uint32_t sampleCountT
>
877 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
// pContext is not referenced by name below
// (NOTE(review): presumably required by the AR_* / stat macros - confirm).
879 SWR_CONTEXT
*pContext
= pDC
->pContext
;
881 AR_BEGIN(BENullBackend
, pDC
->drawId
);
882 ///@todo: handle center multisample pattern
883 AR_BEGIN(BESetup
, pDC
->drawId
);
885 const API_STATE
&state
= GetApiState(pDC
);
// Barycentric coefficients are set up once per call from the triangle descriptor.
887 BarycentricCoeffs coeffs
;
888 SetupBarycentricCoeffs(&coeffs
, work
);
// Only the depth and stencil pointers are requested; the first argument is NULL.
890 uint8_t *pDepthBuffer
, *pStencilBuffer
;
891 SetupRenderBuffers(NULL
, &pDepthBuffer
, &pStencilBuffer
, 0, renderBuffers
);
// psContext is used purely as scratch for sample positions, barycentrics and z.
893 SWR_PS_CONTEXT psContext
;
894 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
// Y position of the current row of SIMD tiles; stepped by dy (SIMD_TILE_Y_DIM) further down.
898 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
900 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
901 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
// Rows of SIMD tiles within the tile.
902 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X position restarts at the tile's left edge; stepped by dx (SIMD_TILE_X_DIM) further down.
904 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
906 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
// SIMD tiles within the current row.
908 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
910 // iterate over active samples
911 unsigned long sample
= 0;
912 uint32_t sampleMask
= state
.blendState
.sampleMask
;
913 while (_BitScanForward(&sample
, sampleMask
))
// Clear the bit just found so the scan advances to the next enabled sample.
915 sampleMask
&= ~(1 << sample
);
// Coverage bits of this sample for the current SIMD tile
// (NOTE(review): MASK is defined elsewhere; presumably it keeps one SIMD tile's worth of low bits - confirm).
917 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
921 // offset depth/stencil buffers current sample
922 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
923 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Optional depth bounds test: the depth values already stored at pDepthSample
// (not the interpolated z) are checked against [minz, maxz], and the mask
// returned by CalcDepthBoundsAcceptMask narrows the coverage.
925 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
927 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
929 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
931 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
932 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
934 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
937 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
939 // calculate per sample positions
940 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, samplePos
.vX(sample
));
941 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, samplePos
.vY(sample
));
943 CalcSampleBarycentrics(coeffs
, psContext
);
945 // interpolate and quantize z
946 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
947 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
949 AR_END(BEBarycentric
, 0);
951 // interpolate user clip distance if available
952 if (state
.rastState
.clipDistanceMask
)
// The result of ComputeUserClipMask is inverted before masking, so the lanes it flags are removed from coverage.
954 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// stencilPassMask starts out equal to the coverage mask and is handed to DepthStencilTest by address.
957 simdscalar vCoverageMask
= vMask(coverageMask
);
958 simdscalar stencilPassMask
= vCoverageMask
;
960 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
961 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
962 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
963 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
// The write is issued straight after the test; nothing else happens between them in this path.
964 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
965 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
966 AR_END(BEEarlyDepthTest
, 0);
// Count the lanes that passed the depth test and add them to the DepthPassCount statistic.
968 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
969 uint32_t statCount
= _mm_popcnt_u32(statMask
);
970 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Shift out one SIMD tile's worth of coverage bits for this sample.
975 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance the depth/stencil pointers by KNOB_SIMD_WIDTH elements of their hot tile formats (bpp / 8 bytes each).
978 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
979 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
// Step the sample positions to the next SIMD tile in x ...
981 vXSamplePosUL
= _simd_add_ps(vXSamplePosUL
, dx
);
// ... and to the next row of SIMD tiles in y.
984 vYSamplePosUL
= _simd_add_ps(vYSamplePosUL
, dy
);
987 AR_END(BENullBackend
, 0);
990 void InitClearTilesTable()
992 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
994 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
995 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
996 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
997 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
998 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
// Backend dispatch tables.
// gBackendNullPs is indexed by multisample count and receives the
// BackendNullPS instantiations in InitBackendFuncTables().
1001 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
// gBackendSingleSample is passed to InitBackendSingleFuncTable(), which
// indexes it as [inputCoverage][isCentroid][canEarlyZ].
1002 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
]
// NOTE(review): presumably filled in by InitBackendPixelRate0() - confirm.
1006 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1007 [2] // isCenterPattern
1008 [SWR_INPUT_COVERAGE_COUNT
]
1010 [2] // forcedSampleCount
// gBackendSampleRateTable is passed to InitBackendSampleFuncTable(), which
// indexes it as [sampleCount][inputCoverage][centroid][canEarlyZ].
1013 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1014 [SWR_INPUT_COVERAGE_COUNT
]
1019 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1020 // arguments to static template arguments.
// Each GetFunc overload consumes its first runtime argument, appends the
// matching compile-time constant to the ArgsT pack and forwards the
// remaining arguments to GetFunc of that wider BEChooser instantiation.
// The overload taking SWR_BACKEND_FUNCS ends the chain: it returns a backend
// function instantiated with SwrBackendTraits<ArgsT...>.
1021 template <uint32_t... ArgsT
>
1024 // Last Arg Terminator
// Selects the backend function template by backend kind; ArgsT is complete at this point.
1025 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1029 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1030 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1031 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1033 SWR_ASSERT(0 && "Invalid backend func\n");
1039 // Recursively parse args
// Input coverage overload: appends the matching SWR_INPUT_COVERAGE_* constant to ArgsT.
1040 template <typename
... TArgsT
>
1041 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
1045 case SWR_INPUT_COVERAGE_NONE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
1046 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
1047 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
// Unrecognized value: assert, then continue as if SWR_INPUT_COVERAGE_NONE had been passed.
1049 SWR_ASSERT(0 && "Invalid sample pattern\n");
1050 return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
1055 // Recursively parse args
// Sample count overload: appends the matching SWR_MULTISAMPLE_* constant to ArgsT.
1056 template <typename
... TArgsT
>
1057 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1061 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1062 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1063 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1064 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1065 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
// Unrecognized value: assert, then continue as if SWR_MULTISAMPLE_1X had been passed.
1067 SWR_ASSERT(0 && "Invalid sample count\n");
1068 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1073 // Recursively parse args
// Boolean overload: appends either 1 or 0 to ArgsT
// (NOTE(review): presumably 1 when tArg is true and 0 otherwise - confirm).
1074 template <typename
... TArgsT
>
1075 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1079 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1082 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1086 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
1088 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1090 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1092 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1094 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1095 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
1096 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
1102 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
1104 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1106 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1108 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1110 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1112 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1113 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
1114 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1121 void InitBackendPixelRate0();
1122 void InitBackendFuncTables()
1124 InitBackendSingleFuncTable(gBackendSingleSample
);
1125 InitBackendPixelRate0();
1126 InitBackendSampleFuncTable(gBackendSampleRateTable
);
1128 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1129 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1130 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1131 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1132 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;