1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "depthstencil.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
40 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, DWORD
[4], const SWR_RECT
& rect
);
41 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
)
50 SWR_CONTEXT
*pContext
= pDC
->pContext
;
52 AR_BEGIN(BEDispatch
, pDC
->drawId
);
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 const API_STATE
& state
= GetApiState(pDC
);
66 SWR_CS_CONTEXT csContext
{ 0 };
67 csContext
.tileCounter
= threadGroupId
;
68 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
69 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
70 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
71 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
72 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
74 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
76 UPDATE_STAT(CsInvocations
, state
.totalThreadsInGroup
);
78 AR_END(BEDispatch
, 1);
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param macroTile - macrotile id supplied by the caller (the signature names this parameter macroTile, not threadGroupId).
/// @param pUserData - opaque user data pointer supplied by the caller.
/// NOTE(review): the function body is not visible in this excerpt, so how the
/// parameters are used could not be confirmed.
86 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
91 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
94 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
95 SWR_ASSERT(x
== 0 && y
== 0);
98 template<SWR_FORMAT format
>
99 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
101 auto lambda
= [&](int32_t comp
)
103 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
105 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
108 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
110 for (uint32_t i
= 0; i
< numIter
; ++i
)
112 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
#if USE_8x2_TILE_BACKEND
/// @brief 16-wide variant of ClearRasterTile, compiled only for the 8x2 tile backend.
/// For every SIMD16 tile in the raster tile, each component of @p value is written
/// with FormatTraits<format>::storeSOA and the write cursor is advanced by
/// KNOB_SIMD16_WIDTH * GetBPC(comp) / 8 bytes.
/// @param pTileBuffer - destination cursor at the start of the raster tile.
/// @param value - per-component clear values.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    // Writes a single component and steps the cursor past it.
    auto storeComponent = [&](int32_t comp)
    {
        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
    };

    const uint32_t simdTilesY = KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM;
    const uint32_t simdTilesX = KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM;

    for (uint32_t remaining = simdTilesY * simdTilesX; remaining != 0; --remaining)
    {
        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(storeComponent);
    }
}
#endif
/// @brief Clears the part of one macrotile's hot tile that a clear touches.
/// The clear value is first converted to the hot tile format: each component is
/// loaded as a float, scaled by FormatTraits<format>::fromFloat and converted to
/// integer when the component is normalized, packed, and placed at its swizzled
/// position. The clear region is then reduced to raster-tile units and every
/// raster tile in it is filled for every sample. Finally the hot tile is marked
/// HOTTILE_DIRTY.
/// @param pDC - draw context; supplies the sample count and the hot tile manager.
/// @param rt - render target attachment whose hot tile is cleared.
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices.
/// @param clear - four 32-bit clear values, reinterpreted as floats per component.
/// @param rect - clear rectangle (see the "intersect with clear rect" step).
/// NOTE(review): the declarations of vClear, vComp and clearTile, and the
/// statements under the "intersect" and "inclusive" comments, are not visible in
/// this excerpt.
136 template<SWR_FORMAT format
>
137 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, DWORD clear
[4], const SWR_RECT
& rect
)
139 // convert clear color to hottile format
140 // clear color is in RGBA float/uint32
141 #if USE_8x2_TILE_BACKEND
143 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
146 vComp
= _simd16_load1_ps((const float*)&clear
[comp
]);
147 if (FormatTraits
<format
>::isNormalized(comp
))
149 vComp
= _simd16_mul_ps(vComp
, _simd16_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
150 vComp
= _simd16_castsi_ps(_simd16_cvtps_epi32(vComp
));
152 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
153 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
158 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
161 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
162 if (FormatTraits
<format
>::isNormalized(comp
))
164 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
165 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
167 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
168 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
172 uint32_t tileX
, tileY
;
173 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
175 // Init to full macrotile
178 KNOB_MACROTILE_X_DIM
* int32_t(tileX
),
179 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
),
180 KNOB_MACROTILE_X_DIM
* int32_t(tileX
+ 1),
181 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
+ 1),
184 // intersect with clear rect
187 // translate to local hottile origin
188 clearTile
.Translate(-int32_t(tileX
) * KNOB_MACROTILE_X_DIM
, -int32_t(tileY
) * KNOB_MACROTILE_Y_DIM
);
190 // Make maximums inclusive (needed for convert to raster tiles)
194 // convert to raster tiles
195 clearTile
.ymin
>>= (KNOB_TILE_Y_DIM_SHIFT
);
196 clearTile
.ymax
>>= (KNOB_TILE_Y_DIM_SHIFT
);
197 clearTile
.xmin
>>= (KNOB_TILE_X_DIM_SHIFT
);
198 clearTile
.xmax
>>= (KNOB_TILE_X_DIM_SHIFT
);
200 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
201 // compute steps between raster tile samples / raster tiles / macro tile rows
202 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
203 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
204 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
205 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
207 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
);
208 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, clearTile
.xmin
, clearTile
.ymin
)) * numSamples
;
209 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
211 // loop over all raster tiles in the current hot tile
// clearTile is in raster-tile units at this point (after the shifts above) and
// both loop bounds are inclusive; each raster tile is cleared once per sample.
212 for (int32_t y
= clearTile
.ymin
; y
<= clearTile
.ymax
; ++y
)
214 uint8_t* pRasterTile
= pRasterTileRow
;
215 for (int32_t x
= clearTile
.xmin
; x
<= clearTile
.xmax
; ++x
)
217 for( int32_t sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
219 ClearRasterTile
<format
>(pRasterTile
, vClear
);
220 pRasterTile
+= rasterTileSampleStep
;
223 pRasterTileRow
+= macroTileRowStep
;
226 pHotTile
->state
= HOTTILE_DIRTY
;
/// @brief Backend handler for a clear work item on one macrotile.
/// Two sequences are visible below. The first records the clear values in the
/// colour/depth/stencil hot tiles and sets their state to HOTTILE_CLEAR. The
/// second clears immediately through sClearTilesTable.
/// NOTE(review): the condition that selects between the two sequences, and the
/// declarations of clearData/clearFloat, are not visible in this excerpt.
/// @param pDC - draw context.
/// @param workerId - worker id.
/// @param macroTile - macrotile id to clear.
/// @param pUserData - points to the CLEAR_DESC describing the clear.
230 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
232 SWR_CONTEXT
*pContext
= pDC
->pContext
;
236 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
237 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
238 uint32_t numSamples
= GetNumSamples(sampleCount
);
240 SWR_ASSERT(pClear
->flags
.bits
!= 0); // shouldn't be here without a reason.
242 AR_BEGIN(BEClear
, pDC
->drawId
);
244 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
246 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_COLOR0
, true, numSamples
);
247 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
248 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
249 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
250 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
251 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
252 pHotTile
->state
= HOTTILE_CLEAR
;
255 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
257 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
258 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
259 pHotTile
->state
= HOTTILE_CLEAR
;
262 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
264 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
266 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearStencil
;
267 pHotTile
->state
= HOTTILE_CLEAR
;
275 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
276 AR_BEGIN(BEClear
, pDC
->drawId
);
278 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
280 /// @todo clear data should come in as RGBA32_FLOAT
// NOTE(review): the conversion below reads the first four bytes at
// &pClear->clearRTColor and divides each by 255, i.e. it treats the start of the
// clear colour as four 8-bit channels; the other sequence above copies
// clearRTColor[0..3] as 32-bit values instead. Confirm which is intended.
283 clearFloat
[0] = ((uint8_t*)(&pClear
->clearRTColor
))[0] / 255.0f
;
284 clearFloat
[1] = ((uint8_t*)(&pClear
->clearRTColor
))[1] / 255.0f
;
285 clearFloat
[2] = ((uint8_t*)(&pClear
->clearRTColor
))[2] / 255.0f
;
286 clearFloat
[3] = ((uint8_t*)(&pClear
->clearRTColor
))[3] / 255.0f
;
287 clearData
[0] = *(DWORD
*)&clearFloat
[0];
288 clearData
[1] = *(DWORD
*)&clearFloat
[1];
289 clearData
[2] = *(DWORD
*)&clearFloat
[2];
290 clearData
[3] = *(DWORD
*)&clearFloat
[3];
292 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
293 SWR_ASSERT(pfnClearTiles
!= nullptr);
295 pfnClearTiles(pDC
, SWR_ATTACHMENT_COLOR0
, macroTile
, clearData
, pClear
->rect
);
298 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
301 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
302 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
303 SWR_ASSERT(pfnClearTiles
!= nullptr);
305 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, clearData
, pClear
->rect
);
308 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
310 uint32_t value
= pClear
->clearStencil
;
312 clearData
[0] = *(DWORD
*)&value
;
// NOTE(review): unlike the colour and depth paths above, no SWR_ASSERT on
// pfnClearTiles is visible before the call in this stencil path.
313 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
315 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, clearData
, pClear
->rect
);
322 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, STORE_TILES_DESC
* pDesc
,
323 SWR_RENDERTARGET_ATTACHMENT attachment
)
325 SWR_CONTEXT
*pContext
= pDC
->pContext
;
327 AR_BEGIN(BEStoreTiles
, pDC
->drawId
);
329 SWR_FORMAT srcFormat
;
332 case SWR_ATTACHMENT_COLOR0
:
333 case SWR_ATTACHMENT_COLOR1
:
334 case SWR_ATTACHMENT_COLOR2
:
335 case SWR_ATTACHMENT_COLOR3
:
336 case SWR_ATTACHMENT_COLOR4
:
337 case SWR_ATTACHMENT_COLOR5
:
338 case SWR_ATTACHMENT_COLOR6
:
339 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
340 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
341 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
342 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
346 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
348 // Only need to store the hottile if it's been rendered to...
349 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, attachment
, false);
352 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
353 if (pHotTile
->state
== HOTTILE_CLEAR
)
355 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
356 SWR_ASSERT(pfnClearTiles
!= nullptr);
358 pfnClearTiles(pDC
, attachment
, macroTile
, pHotTile
->clearData
, pDesc
->rect
);
361 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
363 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
364 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
366 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
367 attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
371 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
373 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
376 AR_END(BEStoreTiles
, 1);
379 void ProcessStoreTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
381 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
383 unsigned long rt
= 0;
384 uint32_t mask
= pDesc
->attachmentMask
;
385 while (_BitScanForward(&rt
, mask
))
388 ProcessStoreTileBE(pDC
, workerId
, macroTile
, pDesc
, (SWR_RENDERTARGET_ATTACHMENT
)rt
);
392 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
394 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
395 SWR_CONTEXT
*pContext
= pDC
->pContext
;
397 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
399 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
401 if (pDesc
->attachmentMask
& (1 << i
))
403 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
404 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
407 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
// Per-lane pixel offsets for an 8-wide SIMD. The UL tables (X = 0,1,0,1,2,3,2,3 and
// Y = 0,0,1,1,0,0,1,1) place the eight lanes on a 4-wide by 2-high pixel footprint;
// the Center tables hold the same positions plus 0.5 in each axis. Any other
// KNOB_SIMD_WIDTH is rejected at compile time by the #error below.
413 #if KNOB_SIMD_WIDTH == 8
414 const __m256 vCenterOffsetsX
= {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
415 const __m256 vCenterOffsetsY
= {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
416 const __m256 vULOffsetsX
= {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
417 const __m256 vULOffsetsY
= {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
419 #error Unsupported vector width
422 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
424 simdscalar vClipMask
= _simd_setzero_ps();
425 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
427 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
429 // pull triangle clip distance values from clip buffer
430 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
431 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
432 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
435 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
437 // clip if interpolated clip distance is < 0 || NAN
438 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
440 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
443 return _simd_movemask_ps(vClipMask
);
/// @brief Single-sample backend: processes one raster tile of a triangle, one SIMD
/// tile at a time.
/// Steps visible per SIMD tile, in source order: optional depth bounds test, input
/// coverage generation, barycentric and centroid setup, z interpolation and
/// quantization, user clip distance culling, depth/stencil test, pixel shader
/// invocation, output merge and depth/stencil write. Afterwards the coverage masks
/// are shifted and the colour/depth/stencil pointers advanced to the next SIMD tile.
/// @param pDC - draw context.
/// @param workerId - worker id.
/// @param x - start value of the xx loop; also added to the per-lane X offsets.
/// @param y - start value of the yy loop; also added to the per-lane Y offsets.
/// @param work - triangle descriptor; its coverage masks are shifted as SIMD tiles are consumed.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the buffer pointers.
/// NOTE(review): the body refers to a type T (T::InputCoverage, CalcCentroid<T, true>),
/// but the template header is not visible in this excerpt. The conditions that select
/// the early versus late depth test, and any jump to the end-of-tile section, are
/// likewise not visible.
447 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
449 SWR_CONTEXT
*pContext
= pDC
->pContext
;
451 AR_BEGIN(BESingleSampleBackend
, pDC
->drawId
);
452 AR_BEGIN(BESetup
, pDC
->drawId
);
454 const API_STATE
&state
= GetApiState(pDC
);
456 BarycentricCoeffs coeffs
;
457 SetupBarycentricCoeffs(&coeffs
, work
);
459 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
460 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
462 SWR_PS_CONTEXT psContext
;
463 SetupPixelShaderContext
<T
>(&psContext
, work
);
467 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
468 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
470 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
472 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
474 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
475 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
477 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
479 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
481 #if USE_8x2_TILE_BACKEND
482 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
485 simdmask coverageMask
= work
.coverageMask
[0] & MASK
;
489 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
491 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
493 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer
));
495 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
496 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
498 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
501 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
503 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
505 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
508 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
510 CalcPixelBarycentrics(coeffs
, psContext
);
512 CalcCentroid
<T
, true>(&psContext
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
514 // interpolate and quantize z
515 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
516 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
518 AR_END(BEBarycentric
, 1);
520 // interpolate user clip distance if available
521 if (state
.rastState
.clipDistanceMask
)
523 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.center
, psContext
.vJ
.center
);
526 simdscalar vCoverageMask
= vMask(coverageMask
);
527 simdscalar depthPassMask
= vCoverageMask
;
528 simdscalar stencilPassMask
= vCoverageMask
;
533 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
534 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
535 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
536 AR_END(BEEarlyDepthTest
, 0);
538 // early-exit if no pixels passed depth or earlyZ is forced on
539 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
541 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
542 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
544 if (!_simd_movemask_ps(depthPassMask
))
551 psContext
.sampleIndex
= 0;
552 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
554 // execute pixel shader
555 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
556 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
557 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
558 AR_END(BEPixelShader
, 0);
// the shader may have changed activeMask; the updated mask drives the later
// depth test, output merge and depth write
560 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
565 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
566 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
567 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
568 AR_END(BELateDepthTest
, 0);
570 if (!_simd_movemask_ps(depthPassMask
))
572 // need to call depth/stencil write for stencil write
573 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
574 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
579 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
580 uint32_t statCount
= _mm_popcnt_u32(statMask
);
581 UPDATE_STAT(DepthPassCount
, statCount
);
584 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
585 #if USE_8x2_TILE_BACKEND
586 OutputMerger(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, useAlternateOffset
);
588 OutputMerger(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
591 // do final depth write after all pixel kills
592 if (!state
.psState
.forceEarlyZ
)
594 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
595 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
597 AR_END(BEOutputMerger
, 0);
601 AR_BEGIN(BEEndTile
, pDC
->drawId
);
// shift out the bits of the SIMD tile just processed so the next iteration
// finds its own coverage in the low bits
603 work
.coverageMask
[0] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
604 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
606 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
609 #if USE_8x2_TILE_BACKEND
610 if (useAlternateOffset
)
612 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
614 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
618 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
620 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
623 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
624 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
626 AR_END(BEEndTile
, 0);
628 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
629 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
632 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
633 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
636 AR_END(BESingleSampleBackend
, 0);
/// @brief Sample-rate backend: processes one raster tile of a triangle, one SIMD
/// tile at a time, running the per-sample work once for every sample of
/// T::MultisampleT.
/// Per SIMD tile: input coverage generation and pixel barycentric/centroid setup.
/// Per sample within it: optional depth bounds test, per-sample position and
/// barycentrics, z interpolation and quantization, user clip distance culling,
/// depth/stencil test, pixel shader invocation, output merge and depth/stencil
/// write, followed by a shift of that sample's coverage mask.
/// @param pDC - draw context.
/// @param workerId - worker id.
/// @param x - start value of the xx loop; also added to the per-lane X offsets.
/// @param y - start value of the yy loop; also added to the per-lane Y offsets.
/// @param work - triangle descriptor; its per-sample coverage masks are shifted as SIMD tiles are consumed.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the buffer pointers.
/// NOTE(review): the body refers to a type T (T::InputCoverage, T::MultisampleT),
/// but the template header is not visible in this excerpt. The conditions that select
/// the early versus late depth test, and how a sample is skipped after its mask has
/// been shifted, are likewise not visible.
640 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
642 SWR_CONTEXT
*pContext
= pDC
->pContext
;
644 AR_BEGIN(BESampleRateBackend
, pDC
->drawId
);
645 AR_BEGIN(BESetup
, pDC
->drawId
);
647 const API_STATE
&state
= GetApiState(pDC
);
649 BarycentricCoeffs coeffs
;
650 SetupBarycentricCoeffs(&coeffs
, work
);
652 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
653 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
655 SWR_PS_CONTEXT psContext
;
656 SetupPixelShaderContext
<T
>(&psContext
, work
);
660 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
661 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
663 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
665 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
667 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
668 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
670 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
672 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
674 #if USE_8x2_TILE_BACKEND
675 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
678 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
680 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
682 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
685 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
687 CalcPixelBarycentrics(coeffs
, psContext
);
689 CalcCentroid
<T
, false>(&psContext
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
691 AR_END(BEBarycentric
, 0);
// everything inside this loop is repeated per sample, using that sample's own
// coverage mask and depth/stencil offsets
693 for (uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
695 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
699 // offset depth/stencil buffers current sample
700 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
701 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
703 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
705 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
707 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
709 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
710 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
712 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
715 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
717 // calculate per sample positions
718 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, T::MultisampleT::vX(sample
));
719 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, T::MultisampleT::vY(sample
));
721 CalcSampleBarycentrics(coeffs
, psContext
);
723 // interpolate and quantize z
724 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
725 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
727 AR_END(BEBarycentric
, 0);
729 // interpolate user clip distance if available
730 if (state
.rastState
.clipDistanceMask
)
732 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
735 simdscalar vCoverageMask
= vMask(coverageMask
);
736 simdscalar depthPassMask
= vCoverageMask
;
737 simdscalar stencilPassMask
= vCoverageMask
;
742 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
743 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
744 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
745 AR_END(BEEarlyDepthTest
, 0);
747 // early-exit if no samples passed depth or earlyZ is forced on.
748 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
750 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
751 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
753 if (!_simd_movemask_ps(depthPassMask
))
755 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
761 psContext
.sampleIndex
= sample
;
762 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
764 // execute pixel shader
765 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
766 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
767 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
768 AR_END(BEPixelShader
, 0);
// the shader may have changed activeMask; the updated mask drives the later
// depth test, output merge and depth write
770 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
775 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
776 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
777 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
778 AR_END(BELateDepthTest
, 0);
780 if (!_simd_movemask_ps(depthPassMask
))
782 // need to call depth/stencil write for stencil write
783 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
784 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
786 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
791 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
792 uint32_t statCount
= _mm_popcnt_u32(statMask
);
793 UPDATE_STAT(DepthPassCount
, statCount
);
796 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
797 #if USE_8x2_TILE_BACKEND
798 OutputMerger(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, useAlternateOffset
);
800 OutputMerger(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
803 // do final depth write after all pixel kills
804 if (!state
.psState
.forceEarlyZ
)
806 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
807 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
809 AR_END(BEOutputMerger
, 0);
// shift out this sample's bits for the SIMD tile just processed
811 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
817 AR_BEGIN(BEEndTile
, pDC
->drawId
);
819 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
821 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
824 #if USE_8x2_TILE_BACKEND
825 if (useAlternateOffset
)
827 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
829 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
833 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
835 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
838 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
839 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
841 AR_END(BEEndTile
, 0);
843 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
844 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
847 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
848 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
851 AR_END(BESampleRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Pixel-rate backend. Walks one tile in SIMD-tile steps, invokes the
///        pixel shader once per SIMD tile and then loops over the output-merger
///        samples, broadcasting the shader results to all passing pixels.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID, forwarded to the PixelRateZTest helper.
/// @param x - x origin of the tile; the inner loop covers [x, x + KNOB_TILE_X_DIM).
/// @param y - y origin of the tile; the outer loop covers [y, y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor. Its coverageMask[], innerCoverageMask and
///        anyCoveredSamples are shifted right as each SIMD tile is consumed, so
///        the descriptor is modified by this call.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the color,
///        depth and stencil pointers.
855 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
857 SWR_CONTEXT
*pContext
= pDC
->pContext
;
859 AR_BEGIN(BEPixelRateBackend
, pDC
->drawId
);
860 AR_BEGIN(BESetup
, pDC
->drawId
);
862 const API_STATE
&state
= GetApiState(pDC
);
864 BarycentricCoeffs coeffs
;
865 SetupBarycentricCoeffs(&coeffs
, work
);
// resolve the color/depth/stencil buffer pointers from renderBuffers
867 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
868 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
870 SWR_PS_CONTEXT psContext
;
871 SetupPixelShaderContext
<T
>(&psContext
, work
);
// depth/stencil test helper; invoked below either before the PS (BEEarlyDepthTest)
// or after it (BELateDepthTest)
875 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, workerId
, work
, coeffs
, state
, pDepthBuffer
, pStencilBuffer
, state
.rastState
.clipDistanceMask
);
// starting Y coordinates (upper-left and center) derived from the tile origin y
877 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
878 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
880 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
// walk the tile in Y, one SIMD tile row per iteration
882 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// reset the X coordinates to the tile origin x for every row
884 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
885 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
887 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
889 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// 8x2 backend only: set when (xx & SIMD_TILE_X_DIM) is non-zero; it is passed to
// OutputMerger and gates the extra color buffer advance at the end of the tile
891 #if USE_8x2_TILE_BACKEND
892 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
895 simdscalar activeLanes
;
// no covered samples in this SIMD tile: jump to the end-of-tile bookkeeping
896 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
897 activeLanes
= vMask(work
.anyCoveredSamples
& MASK
);
// fill psContext.inputMask only when the traits request input coverage
899 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
// inner-conservative coverage reads innerCoverageMask; other modes read coverageMask[0]
901 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
903 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
906 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
908 CalcPixelBarycentrics(coeffs
, psContext
);
910 CalcCentroid
<T
, false>(&psContext
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
912 AR_END(BEBarycentric
, 0);
914 if(T::bForcedSampleCount
)
916 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
// NOTE(review): _simd_cmpgt_epi32 is presumably a signed compare, so a sampleMask with
// bit 31 set would compare as negative and disable every lane - confirm sampleMask is
// limited to valid sample bits before it reaches this point
917 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state
.blendState
.sampleMask
), _simd_setzero_si()));
918 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
// early depth/stencil test: runs before the PS when the traits allow it and no
// forced sample count is in effect
922 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
924 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
925 UPDATE_STAT(DepthPassCount
, depthPassCount
);
928 // if we have no covered samples that passed depth at this point, go to next tile
929 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// Z is only interpolated at pixel center when the PS declares it uses source depth
931 if(state
.psState
.usesSourceDepth
)
933 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
934 // interpolate and quantize z
935 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
936 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
937 AR_END(BEBarycentric
, 0);
940 // pixels that are currently active
941 psContext
.activeMask
= _simd_castps_si(activeLanes
);
942 psContext
.oMask
= T::MultisampleT::FullSampleMask();
944 // execute pixel shader
945 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
946 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
947 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
948 AR_END(BEPixelShader
, 0);
950 // update active lanes to remove any discarded or oMask'd pixels
951 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
952 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// late depth/stencil test: runs after the PS when early Z was not possible
955 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
957 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
958 UPDATE_STAT(DepthPassCount
, depthPassCount
);
961 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
962 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
965 // loop over all samples, broadcasting the results of the PS to all passing pixels
966 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(state
.blendState
.sampleCount
); sample
++)
968 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
969 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
970 uint32_t coverageSampleNum
= (T::bIsStandardPattern
) ? sample
: 0;
971 simdscalar coverageMask
, depthMask
;
// a forced sample count uses the active lanes for both masks
972 if(T::bForcedSampleCount
)
974 coverageMask
= depthMask
= activeLanes
;
// masks recorded by PixelRateZTest for this coverage sample
978 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
979 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
980 if(!_simd_movemask_ps(depthMask
))
982 // stencil should already have been written in early/lateZ tests
983 AR_END(BEOutputMerger
, 0);
988 // broadcast the results of the PS to all passing pixels
989 #if USE_8x2_TILE_BACKEND
990 OutputMerger(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.numRenderTargets
, useAlternateOffset
);
992 OutputMerger(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.numRenderTargets
);
// write depth/stencil for this sample unless early Z was forced or a forced
// sample count is in use
995 if(!state
.psState
.forceEarlyZ
&& !T::bForcedSampleCount
)
997 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
998 uint8_t * pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
1000 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
1001 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
1003 AR_END(BEOutputMerger
, 0);
// end-of-tile bookkeeping: consume this SIMD tile's coverage bits and advance
// the buffer pointers
1006 AR_BEGIN(BEEndTile
, pDC
->drawId
);
1008 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1010 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1013 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
1015 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1017 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// 8x2 backend: color pointers only move when useAlternateOffset is set, and then
// by two SIMD widths of pixels at once
1019 #if USE_8x2_TILE_BACKEND
1020 if (useAlternateOffset
)
1022 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
1024 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1028 for(uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
1030 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// advance depth/stencil by KNOB_SIMD_WIDTH pixels, converted from bits (bpp) to bytes
1032 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1033 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1036 AR_END(BEEndTile
, 0);
// step the X coordinates to the next SIMD tile
1038 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
1039 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// step the Y coordinates to the next SIMD tile row
1042 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
1043 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
1046 AR_END(BEPixelRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Backend flow for a NULL pixel shader. No shader is invoked and no
///        color buffers are requested; for every sample enabled in the blend
///        state sample mask the function interpolates Z, runs the
///        depth/stencil test and write, and updates the DepthPassCount stat.
/// @tparam sampleCountT - multisample count used to select SwrBackendTraits
///         (always with SWR_MSAA_STANDARD_PATTERN, see the todo below).
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID; not referenced directly in the visible body.
/// @param x - x origin of the tile; the inner loop covers [x, x + KNOB_TILE_X_DIM).
/// @param y - y origin of the tile; the outer loop covers [y, y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor; work.coverageMask[sample] is shifted right
///        as each SIMD tile is consumed.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the depth and
///        stencil pointers.
1048 // optimized backend flow with NULL PS
1049 template<uint32_t sampleCountT
>
1050 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1052 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1054 AR_BEGIN(BENullBackend
, pDC
->drawId
);
1055 ///@todo: handle center multisample pattern
1056 typedef SwrBackendTraits
<sampleCountT
, SWR_MSAA_STANDARD_PATTERN
> T
;
1057 AR_BEGIN(BESetup
, pDC
->drawId
);
1059 const API_STATE
&state
= GetApiState(pDC
);
1061 BarycentricCoeffs coeffs
;
1062 SetupBarycentricCoeffs(&coeffs
, work
);
// no color targets on this path: NULL color array and a render target count of 0
1064 uint8_t *pDepthBuffer
, *pStencilBuffer
;
1065 SetupRenderBuffers(NULL
, &pDepthBuffer
, &pStencilBuffer
, 0, renderBuffers
);
// psContext is used below only as scratch storage for sample positions,
// barycentrics and Z
1067 SWR_PS_CONTEXT psContext
;
1068 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
// starting upper-left Y coordinates derived from the tile origin y
1072 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
1074 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
// walk the tile in Y, one SIMD tile row per iteration
1076 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// reset the upper-left X coordinates to the tile origin x for every row
1078 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
1080 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
1082 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
1084 // iterate over active samples
1085 unsigned long sample
= 0;
1086 uint32_t sampleMask
= state
.blendState
.sampleMask
;
// each pass picks a set bit of the local sampleMask copy and clears it, so the
// loop ends once every enabled sample has been visited
// NOTE(review): sample indexes work.coverageMask[] and the MultisampleT position
// lookups - assumes sampleMask only carries bits for valid samples; confirm
1087 while (_BitScanForward(&sample
, sampleMask
))
1089 sampleMask
&= ~(1 << sample
);
// coverage bits of this sample that fall inside the current SIMD tile
1091 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
1095 // offset depth/stencil buffers current sample
1096 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
1097 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// optional depth bounds test against the Z values already stored in the depth buffer
1099 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
// the raw load below reinterprets the depth buffer as floats, hence the format check
1101 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
1103 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
1105 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
1106 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
1108 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
1111 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
1113 // calculate per sample positions
1114 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, T::MultisampleT::vX(sample
));
1115 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, T::MultisampleT::vY(sample
));
1117 CalcSampleBarycentrics(coeffs
, psContext
);
1119 // interpolate and quantize z
1120 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1121 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
1123 AR_END(BEBarycentric
, 0);
1125 // interpolate user clip distance if available
1126 if (state
.rastState
.clipDistanceMask
)
// lanes flagged by ComputeUserClipMask are removed from the coverage mask
1128 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// expand the bit mask to a SIMD lane mask; stencilPassMask starts as full coverage
// and is passed by address to DepthStencilTest
1131 simdscalar vCoverageMask
= vMask(coverageMask
);
1132 simdscalar stencilPassMask
= vCoverageMask
;
// depth/stencil test immediately followed by the write, both inside the
// BEEarlyDepthTest AR_BEGIN/AR_END region
1134 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
1135 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
1136 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1137 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1138 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1139 AR_END(BEEarlyDepthTest
, 0);
// count the lanes set in depthPassMask and add them to the DepthPassCount stat
1141 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1142 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1143 UPDATE_STAT(DepthPassCount
, statCount
);
// consume this SIMD tile's bits from the sample's coverage mask
1148 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// advance depth/stencil by KNOB_SIMD_WIDTH pixels, converted from bits (bpp) to bytes
1151 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1152 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
// step the X coordinates to the next SIMD tile
1154 vXSamplePosUL
= _simd_add_ps(vXSamplePosUL
, dx
);
// step the Y coordinates to the next SIMD tile row
1157 vYSamplePosUL
= _simd_add_ps(vYSamplePosUL
, dy
);
1160 AR_END(BENullBackend
, 0);
1163 void InitClearTilesTable()
1165 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
1167 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1168 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1169 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1170 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1171 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
1174 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
1175 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
]
1179 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1180 [SWR_MSAA_SAMPLE_PATTERN_COUNT
]
1181 [SWR_INPUT_COVERAGE_COUNT
]
1183 [2] // forcedSampleCount
1186 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1187 [SWR_INPUT_COVERAGE_COUNT
]
1192 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1193 // arguments to static template arguments.
1194 template <uint32_t... ArgsT
>
1197 // Last Arg Terminator
1198 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1202 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1203 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1204 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1206 SWR_ASSERT(0 && "Invalid backend func\n");
1212 // Recursively parse args
1213 template <typename
... TArgsT
>
1214 static PFN_BACKEND_FUNC
GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg
, TArgsT
... remainingArgs
)
1218 case SWR_MSAA_CENTER_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_CENTER_PATTERN
>::GetFunc(remainingArgs
...); break;
1219 case SWR_MSAA_STANDARD_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...); break;
1221 SWR_ASSERT(0 && "Invalid sample pattern\n");
1222 return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...);
1227 // Recursively parse args
1228 template <typename
... TArgsT
>
1229 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
1233 case SWR_INPUT_COVERAGE_NONE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
1234 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
1235 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
1237 SWR_ASSERT(0 && "Invalid sample pattern\n");
1238 return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
1243 // Recursively parse args
1244 template <typename
... TArgsT
>
1245 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1249 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1250 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1251 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1252 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1253 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1255 SWR_ASSERT(0 && "Invalid sample count\n");
1256 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1261 // Recursively parse args
1262 template <typename
... TArgsT
>
1263 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1267 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1270 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1274 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
1276 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1278 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1280 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1282 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1283 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1284 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
1290 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_MSAA_SAMPLE_PATTERN_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2][2])
1292 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1294 for(uint32_t samplePattern
= SWR_MSAA_CENTER_PATTERN
; samplePattern
< SWR_MSAA_SAMPLE_PATTERN_COUNT
; samplePattern
++)
1296 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1298 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1300 for(uint32_t forcedSampleCount
= 0; forcedSampleCount
< 2; forcedSampleCount
++)
1302 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1304 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][forcedSampleCount
][canEarlyZ
] =
1305 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, (SWR_MSAA_SAMPLE_PATTERN
)samplePattern
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1306 (isCentroid
> 0), (forcedSampleCount
> 0), (canEarlyZ
> 0), SWR_BACKEND_MSAA_PIXEL_RATE
);
1315 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
1317 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1319 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1321 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1323 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1325 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1326 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1327 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1334 void InitBackendFuncTables()
1336 InitBackendSingleFuncTable(gBackendSingleSample
);
1337 InitBackendPixelFuncTable(gBackendPixelRateTable
);
1338 InitBackendSampleFuncTable(gBackendSampleRateTable
);
1340 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1341 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1342 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1343 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1344 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;