1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "depthstencil.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
40 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, DWORD
[4], const SWR_RECT
& rect
);
41 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
)
50 SWR_CONTEXT
*pContext
= pDC
->pContext
;
52 AR_BEGIN(BEDispatch
, pDC
->drawId
);
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 const API_STATE
& state
= GetApiState(pDC
);
66 SWR_CS_CONTEXT csContext
{ 0 };
67 csContext
.tileCounter
= threadGroupId
;
68 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
69 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
70 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
71 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
72 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
74 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
76 UPDATE_STAT(CsInvocations
, state
.totalThreadsInGroup
);
78 AR_END(BEDispatch
, 1);
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param macroTile - macrotile id supplied by the caller (the signature has no threadGroupId).
/// @param pUserData - opaque work payload supplied by the caller.
/// NOTE(review): the body of this function is not visible in this listing;
/// confirm what, if anything, it does with these arguments.
86 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
91 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
94 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
95 SWR_ASSERT(x
== 0 && y
== 0);
98 template<SWR_FORMAT format
>
99 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
101 auto lambda
= [&](int32_t comp
)
103 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
104 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
107 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
108 for (uint32_t i
= 0; i
< numIter
; ++i
)
110 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
//////////////////////////////////////////////////////////////////////////
/// @brief Clear the raster tiles of one macrotile's hot tile to a constant value.
/// @tparam format - surface format of the hot tile; selects the FormatTraits
///                  used for conversion, packing, swizzle and byte sizes.
/// @param pDC - pointer to draw context.
/// @param rt - attachment whose hot tile is fetched and cleared.
/// @param macroTile - macrotile id; decoded into tile x/y indices below.
/// @param clear - per-component clear values; each DWORD is loaded as a float.
/// @param rect - clear rectangle.
/// NOTE(review): several statements referred to by the comments below (the
/// declarations of vClear, vComp and clearTile, the intersection with rect and
/// the inclusive-maximum adjustment) are not visible in this listing.
114 template<SWR_FORMAT format
>
115 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, DWORD clear
[4], const SWR_RECT
& rect
)
117 // convert clear color to hottile format
118 // clear color is in RGBA float/uint32
120 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
123 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
// Normalized components are scaled by the format's fromFloat factor and
// converted to integers, kept as raw bits in the float register.
124 if (FormatTraits
<format
>::isNormalized(comp
))
126 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
127 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
// Pack the component and store it at its swizzled position in the clear vector.
129 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
130 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
133 uint32_t tileX
, tileY
;
134 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
136 // Init to full macrotile
139 KNOB_MACROTILE_X_DIM
* int32_t(tileX
),
140 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
),
141 KNOB_MACROTILE_X_DIM
* int32_t(tileX
+ 1),
142 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
+ 1),
145 // intersect with clear rect
148 // translate to local hottile origin
149 clearTile
.Translate(-int32_t(tileX
) * KNOB_MACROTILE_X_DIM
, -int32_t(tileY
) * KNOB_MACROTILE_Y_DIM
);
151 // Make maximums inclusive (needed for convert to raster tiles)
155 // convert to raster tiles
156 clearTile
.ymin
>>= (KNOB_TILE_Y_DIM_SHIFT
);
157 clearTile
.ymax
>>= (KNOB_TILE_Y_DIM_SHIFT
);
158 clearTile
.xmin
>>= (KNOB_TILE_X_DIM_SHIFT
);
159 clearTile
.xmax
>>= (KNOB_TILE_X_DIM_SHIFT
);
161 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
162 // compute steps between raster tile samples / raster tiles / macro tile rows
163 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
164 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
165 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
166 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
// Fetch the hot tile for this attachment and locate the first raster tile to
// clear; the 2D tile offset is scaled by the sample count.
168 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
);
169 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, clearTile
.xmin
, clearTile
.ymin
)) * numSamples
;
170 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
172 // loop over all raster tiles in the current hot tile
173 for (int32_t y
= clearTile
.ymin
; y
<= clearTile
.ymax
; ++y
)
175 uint8_t* pRasterTile
= pRasterTileRow
;
176 for (int32_t x
= clearTile
.xmin
; x
<= clearTile
.xmax
; ++x
)
178 for( int32_t sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
180 ClearRasterTile
<format
>(pRasterTile
, vClear
);
181 pRasterTile
+= rasterTileSampleStep
;
184 pRasterTileRow
+= macroTileRowStep
;
// The hot tile now holds cleared contents; flag it dirty.
187 pHotTile
->state
= HOTTILE_DIRTY
;
//////////////////////////////////////////////////////////////////////////
/// @brief Back-end clear work for one macrotile.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - macrotile id whose hot tiles are cleared.
/// @param pUserData - pointer to the CLEAR_DESC describing the clear
///                    (flags, clear values, rect).
/// Two clear sequences are visible below. The first only records the clear
/// values in each selected hot tile and sets its state to HOTTILE_CLEAR. The
/// second clears the tile contents immediately through sClearTilesTable.
/// NOTE(review): the construct that selects between the two sequences is not
/// visible in this listing.
191 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
193 SWR_CONTEXT
*pContext
= pDC
->pContext
;
197 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
198 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
199 uint32_t numSamples
= GetNumSamples(sampleCount
);
201 SWR_ASSERT(pClear
->flags
.bits
!= 0); // shouldn't be here without a reason.
203 AR_BEGIN(BEClear
, pDC
->drawId
);
205 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
207 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_COLOR0
, true, numSamples
);
208 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
209 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
210 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
211 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
212 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
213 pHotTile
->state
= HOTTILE_CLEAR
;
216 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
218 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
219 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
220 pHotTile
->state
= HOTTILE_CLEAR
;
223 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
225 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
227 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearStencil
;
228 pHotTile
->state
= HOTTILE_CLEAR
;
// Second clear sequence: clears hot tile contents immediately through the
// per-format entry of sClearTilesTable.
236 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
237 AR_BEGIN(BEClear
, pDC
->drawId
);
239 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
241 /// @todo clear data should come in as RGBA32_FLOAT
// Reads the first four bytes of clearRTColor and divides each by 255.0f; the
// resulting float bit patterns are what get passed as the clear data.
244 clearFloat
[0] = ((uint8_t*)(&pClear
->clearRTColor
))[0] / 255.0f
;
245 clearFloat
[1] = ((uint8_t*)(&pClear
->clearRTColor
))[1] / 255.0f
;
246 clearFloat
[2] = ((uint8_t*)(&pClear
->clearRTColor
))[2] / 255.0f
;
247 clearFloat
[3] = ((uint8_t*)(&pClear
->clearRTColor
))[3] / 255.0f
;
248 clearData
[0] = *(DWORD
*)&clearFloat
[0];
249 clearData
[1] = *(DWORD
*)&clearFloat
[1];
250 clearData
[2] = *(DWORD
*)&clearFloat
[2];
251 clearData
[3] = *(DWORD
*)&clearFloat
[3];
253 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
254 SWR_ASSERT(pfnClearTiles
!= nullptr);
256 pfnClearTiles(pDC
, SWR_ATTACHMENT_COLOR0
, macroTile
, clearData
, pClear
->rect
);
259 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
262 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
263 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
264 SWR_ASSERT(pfnClearTiles
!= nullptr);
266 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, clearData
, pClear
->rect
);
269 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
271 uint32_t value
= pClear
->clearStencil
;
273 clearData
[0] = *(DWORD
*)&value
;
274 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
// NOTE(review): no SWR_ASSERT on pfnClearTiles is visible for the stencil
// case, unlike the color and depth cases above.
276 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, clearData
, pClear
->rect
);
//////////////////////////////////////////////////////////////////////////
/// @brief Store one macrotile's hot tile for the attachment named in the
///        STORE_TILES_DESC, performing a pending clear first.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - macrotile id; decoded to x/y to compute the destination origin.
/// @param pData - pointer to the STORE_TILES_DESC for this store
///                (attachment, rect, postStoreTileState).
/// NOTE(review): the declaration of x/y and any null check on pHotTile are not
/// visible in this listing.
284 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
286 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
287 SWR_CONTEXT
*pContext
= pDC
->pContext
;
289 AR_BEGIN(BEStoreTiles
, pDC
->drawId
);
291 #ifdef KNOB_ENABLE_RDTSC
292 uint32_t numTiles
= 0;
// Pick the hot tile format for the attachment: all color attachments share the
// color format; an unknown attachment asserts and falls back to the color format.
294 SWR_FORMAT srcFormat
;
295 switch (pDesc
->attachment
)
297 case SWR_ATTACHMENT_COLOR0
:
298 case SWR_ATTACHMENT_COLOR1
:
299 case SWR_ATTACHMENT_COLOR2
:
300 case SWR_ATTACHMENT_COLOR3
:
301 case SWR_ATTACHMENT_COLOR4
:
302 case SWR_ATTACHMENT_COLOR5
:
303 case SWR_ATTACHMENT_COLOR6
:
304 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
305 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
306 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
307 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc
->attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
311 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
313 // Only need to store the hottile if it's been rendered to...
314 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, pDesc
->attachment
, false);
317 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
318 if (pHotTile
->state
== HOTTILE_CLEAR
)
320 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
321 SWR_ASSERT(pfnClearTiles
!= nullptr);
323 pfnClearTiles(pDC
, pDesc
->attachment
, macroTile
, pHotTile
->clearData
, pDesc
->rect
);
// Store when the tile is dirty, or when the descriptor's post-store state is
// itself dirty; the destination origin is the macrotile's pixel position.
326 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
328 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
329 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
331 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
332 pDesc
->attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
// Dirty or resolved tiles move to the state requested by the descriptor.
336 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
338 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
// NOTE(review): numTiles is declared under KNOB_ENABLE_RDTSC above and is never
// incremented in the visible code; presumably AR_END ignores it when profiling
// is disabled -- confirm.
341 AR_END(BEStoreTiles
, numTiles
);
//////////////////////////////////////////////////////////////////////////
/// @brief Set the hot tiles of the selected attachments of one macrotile to
///        a new state, without loading their contents.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - macrotile id whose hot tiles are updated.
/// @param pData - pointer to a DISCARD_INVALIDATE_TILES_DESC: attachmentMask
///                selects attachments (bit i <-> attachment i), createNewTiles
///                is forwarded to GetHotTileNoLoad, newTileState is the state written.
/// NOTE(review): any null check on the returned hot tile is not visible in
/// this listing.
345 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
347 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
348 SWR_CONTEXT
*pContext
= pDC
->pContext
;
350 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
// Visit every attachment slot and act only on those whose bit is set in the mask.
352 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
354 if (pDesc
->attachmentMask
& (1 << i
))
356 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
357 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
360 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
366 #if KNOB_SIMD_WIDTH == 8
367 const __m256 vCenterOffsetsX
= {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
368 const __m256 vCenterOffsetsY
= {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
369 const __m256 vULOffsetsX
= {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
370 const __m256 vULOffsetsY
= {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
372 #error Unsupported vector width
375 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
377 simdscalar vClipMask
= _simd_setzero_ps();
378 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
380 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
382 // pull triangle clip distance values from clip buffer
383 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
384 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
385 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
388 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
390 // clip if interpolated clip distance is < 0 || NAN
391 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
393 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
396 return _simd_movemask_ps(vClipMask
);
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample back-end for one raster tile of a triangle.
///        Steps through the tile one SIMD tile at a time and, where the
///        coverage mask has bits set, computes barycentrics and Z, applies
///        user clip distances, runs depth/stencil test, the pixel shader and
///        the output merger.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param x - start of the tile walk in x (xx runs from x to x + KNOB_TILE_X_DIM).
/// @param y - start of the tile walk in y (yy runs from y to y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor; supplies coverage, interpolation
///               coefficients, attribute pointers and triangle flags.
///               innerCoverageMask may be shifted in place as tiles are consumed.
/// @param renderBuffers - color/depth/stencil base pointers for this tile.
/// NOTE(review): T is used as a traits type throughout, but the template
/// header declaring it, and the targets of the early-out branches, are not
/// visible in this listing.
400 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
402 SWR_CONTEXT
*pContext
= pDC
->pContext
;
404 AR_BEGIN(BESingleSampleBackend
, pDC
->drawId
);
405 AR_BEGIN(BESetup
, pDC
->drawId
);
407 const API_STATE
& state
= GetApiState(pDC
);
408 const SWR_RASTSTATE
& rastState
= state
.rastState
;
409 const SWR_PS_STATE
*pPSState
= &state
.psState
;
410 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
411 uint64_t coverageMask
= work
.coverageMask
[0];
// Broadcast the triangle's scalar coefficients to every SIMD lane.
414 BarycentricCoeffs coeffs
;
415 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
416 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
417 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
419 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
420 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
421 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
423 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
424 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
425 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
427 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
429 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
430 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
431 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Local copies of the buffer pointers; they are advanced after every SIMD tile.
433 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
434 uint32_t NumRT
= state
.psState
.numRenderTargets
;
435 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
437 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
439 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
// Per-triangle pixel shader inputs, filled once before the tile walk.
442 SWR_PS_CONTEXT psContext
;
443 psContext
.pAttribs
= work
.pAttribs
;
444 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
445 psContext
.frontFace
= work
.triFlags
.frontFacing
;
446 psContext
.primID
= work
.triFlags
.primID
;
448 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
449 psContext
.I
= work
.I
;
450 psContext
.J
= work
.J
;
451 psContext
.recipDet
= work
.recipDet
;
452 psContext
.pRecipW
= work
.pRecipW
;
453 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
454 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
455 psContext
.rasterizerSampleCount
= T::MultisampleT::numSamples
;
// Walk the raster tile in SIMD-tile steps: rows (yy) outer, columns (xx) inner.
457 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
460 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
462 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
464 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
466 if(coverageMask
& MASK
)
468 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
470 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
472 if(T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
474 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
:
475 &work
.coverageMask
[0];
476 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, pBlendState
->sampleMask
);
479 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
480 CalcPixelBarycentrics(coeffs
, psContext
);
482 // for 1x case, centroid is pixel center
483 psContext
.vX
.centroid
= psContext
.vX
.center
;
484 psContext
.vY
.centroid
= psContext
.vY
.center
;
485 psContext
.vI
.centroid
= psContext
.vI
.center
;
486 psContext
.vJ
.centroid
= psContext
.vJ
.center
;
487 psContext
.vOneOverW
.centroid
= psContext
.vOneOverW
.center
;
489 // interpolate and quantize z
490 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
491 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
492 AR_END(BEBarycentric
, 1);
494 simdmask clipCoverageMask
= coverageMask
& MASK
;
495 // interpolate user clip distance if available
496 if(rastState
.clipDistanceMask
)
498 clipCoverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
499 psContext
.vI
.center
, psContext
.vJ
.center
);
// Depth and stencil pass masks both start out as the clipped coverage.
502 simdscalar vCoverageMask
= vMask(clipCoverageMask
);
503 simdscalar depthPassMask
= vCoverageMask
;
504 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test profiled as the early test, ahead of the pixel shader.
509 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
510 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
511 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
512 AR_END(BEEarlyDepthTest
, 0);
514 // early-exit if no pixels passed depth or earlyZ is forced on
515 if(pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
517 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
518 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
520 if (!_simd_movemask_ps(depthPassMask
))
527 psContext
.sampleIndex
= 0;
528 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
530 // execute pixel shader
531 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
532 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
533 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
534 AR_END(BEPixelShader
, 0);
// Pick up the active mask as the shader left it.
536 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Depth/stencil test profiled as the late test, after the pixel shader.
541 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
542 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
543 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
544 AR_END(BELateDepthTest
, 0);
546 if(!_simd_movemask_ps(depthPassMask
))
548 // need to call depth/stencil write for stencil write
549 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
550 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
// Count the lanes that passed the depth test.
555 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
556 uint32_t statCount
= _mm_popcnt_u32(statMask
);
557 UPDATE_STAT(DepthPassCount
, statCount
);
560 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
561 OutputMerger(psContext
, pColorBase
, 0, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
563 // do final depth write after all pixel kills
564 if (!pPSState
->forceEarlyZ
)
566 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
567 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
569 AR_END(BEOutputMerger
, 0);
// End of SIMD tile: drop the coverage bits just processed and step every
// buffer pointer forward by one SIMD of pixels in its hot tile format.
573 AR_BEGIN(BEEndTile
, pDC
->drawId
);
574 coverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
575 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
577 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
579 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
580 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
582 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
584 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
586 AR_END(BEEndTile
, 0);
589 AR_END(BESingleSampleBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Sample-rate back-end for one raster tile of a triangle.
///        Steps through the tile one SIMD tile at a time and, for every
///        sample of the multisample pattern, computes per-sample barycentrics
///        and Z, applies user clip distances, runs depth/stencil test, the
///        pixel shader and the output merger against that sample's buffers.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param x - start of the tile walk in x (xx runs from x to x + KNOB_TILE_X_DIM).
/// @param y - start of the tile walk in y (yy runs from y to y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor; supplies per-sample coverage,
///               interpolation coefficients, attribute pointers and flags.
///               coverageMask[sample] is shifted in place as tiles are
///               consumed, and innerCoverageMask may be as well.
/// @param renderBuffers - color/depth/stencil base pointers for this tile.
/// NOTE(review): T is used as a traits type throughout, but the template
/// header declaring it, the conditions guarding some sections and the
/// early-out control flow are not visible in this listing.
593 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
595 SWR_CONTEXT
*pContext
= pDC
->pContext
;
597 AR_BEGIN(BESampleRateBackend
, pDC
->drawId
);
598 AR_BEGIN(BESetup
, pDC
->drawId
);
600 const API_STATE
& state
= GetApiState(pDC
);
601 const SWR_RASTSTATE
& rastState
= state
.rastState
;
602 const SWR_PS_STATE
*pPSState
= &state
.psState
;
603 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
// Broadcast the triangle's scalar coefficients to every SIMD lane.
606 BarycentricCoeffs coeffs
;
607 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
608 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
609 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
611 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
612 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
613 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
615 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
616 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
617 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
619 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
621 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
622 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
623 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Local copies of the buffer pointers; they are advanced after every SIMD tile.
625 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
626 uint32_t NumRT
= state
.psState
.numRenderTargets
;
627 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
629 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
631 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
// Per-triangle pixel shader inputs, filled once before the tile walk.
634 SWR_PS_CONTEXT psContext
;
635 psContext
.pAttribs
= work
.pAttribs
;
636 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
637 psContext
.pRecipW
= work
.pRecipW
;
638 psContext
.frontFace
= work
.triFlags
.frontFacing
;
639 psContext
.primID
= work
.triFlags
.primID
;
641 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
642 psContext
.I
= work
.I
;
643 psContext
.J
= work
.J
;
644 psContext
.recipDet
= work
.recipDet
;
645 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
646 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
647 psContext
.rasterizerSampleCount
= T::MultisampleT::numSamples
;
// Walk the raster tile in SIMD-tile steps: rows (yy) outer, columns (xx) inner.
649 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
652 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
654 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
656 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
658 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
660 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
662 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
663 CalcPixelBarycentrics(coeffs
, psContext
);
664 AR_END(BEBarycentric
, 0);
666 if(T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
668 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
:
669 &work
.coverageMask
[0];
670 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, pBlendState
->sampleMask
);
675 /// @todo don't need to generate input coverage 2x if input coverage and centroid
676 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
677 if(T::bIsStandardPattern
)
679 CalcCentroidPos
<T
>(psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
683 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
684 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
686 CalcCentroidBarycentrics(coeffs
, psContext
, psContext
.vX
.UL
, psContext
.vY
.UL
);
687 AR_END(BEBarycentric
, 0);
// NOTE(review): these read vX.sample / vY.sample, which the visible code only
// assigns inside the per-sample loop below -- confirm the intended source.
691 psContext
.vX
.centroid
= psContext
.vX
.sample
;
692 psContext
.vY
.centroid
= psContext
.vY
.sample
;
// Process every sample of the pattern for this SIMD tile.
695 for(uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
697 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
700 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
701 // calculate per sample positions
702 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, T::MultisampleT::vX(sample
));
703 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, T::MultisampleT::vY(sample
));
705 CalcSampleBarycentrics(coeffs
, psContext
);
707 // interpolate and quantize z
708 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
709 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
710 AR_END(BEBarycentric
, 0);
712 // interpolate user clip distance if available
713 if (rastState
.clipDistanceMask
)
715 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
716 psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Depth and stencil pass masks both start out as the clipped coverage.
719 simdscalar vCoverageMask
= vMask(coverageMask
);
720 simdscalar depthPassMask
= vCoverageMask
;
721 simdscalar stencilPassMask
= vCoverageMask
;
723 // offset depth/stencil buffers current sample
724 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
725 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
// Depth/stencil test profiled as the early test, ahead of the pixel shader.
730 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
731 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
732 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
733 AR_END(BEEarlyDepthTest
, 0);
735 // early-exit if no samples passed depth or earlyZ is forced on.
736 if (pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
738 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
739 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
741 if (!_simd_movemask_ps(depthPassMask
))
743 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
749 psContext
.sampleIndex
= sample
;
750 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
752 // execute pixel shader
753 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
754 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
755 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
756 AR_END(BEPixelShader
, 0);
// Pick up the active mask as the shader left it.
758 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Depth/stencil test profiled as the late test, after the pixel shader.
763 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
764 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
765 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
766 AR_END(BELateDepthTest
, 0);
768 if (!_simd_movemask_ps(depthPassMask
))
770 // need to call depth/stencil write for stencil write
771 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
772 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
774 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Count the lanes that passed the depth test.
779 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
780 uint32_t statCount
= _mm_popcnt_u32(statMask
);
781 UPDATE_STAT(DepthPassCount
, statCount
);
784 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
785 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
787 // do final depth write after all pixel kills
788 if (!pPSState
->forceEarlyZ
)
790 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
791 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
793 AR_END(BEOutputMerger
, 0);
// Drop this sample's coverage bits for the SIMD tile just processed.
795 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// End of SIMD tile: step every buffer pointer forward by one SIMD of pixels
// in its hot tile format.
797 AR_BEGIN(BEEndTile
, pDC
->drawId
);
798 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
800 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
802 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
803 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
805 for (uint32_t rt
= 0; rt
< NumRT
; ++rt
)
807 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
809 AR_END(BEEndTile
, 0);
812 AR_END(BESampleRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Pixel-rate backend. Walks the tile starting at (x, y) one SIMD
///        tile at a time, invokes the pixel shader once per SIMD tile and
///        then loops over the output-merger samples, handing the shader
///        results to OutputMerger for each sample.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID; forwarded to the PixelRateZTestLoop functor.
/// @param x - starting x of the tile walked by the inner loop.
/// @param y - starting y of the tile walked by the outer loop.
/// @param work - triangle descriptor. Modified: its coverage masks are
///        shifted right as each SIMD tile is consumed.
/// @param renderBuffers - color/depth/stencil base pointers for the tile.
/// @note NOTE(review): `T` is a template parameter (backend traits) whose
///       declaration line is not visible in this extract. Brace-only lines,
///       `else` lines and the `Endtile` label are likewise not visible here,
///       so the nesting described in the comments below should be confirmed
///       against the full file.
816 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
818 SWR_CONTEXT
*pContext
= pDC
->pContext
;
820 AR_BEGIN(BEPixelRateBackend
, pDC
->drawId
);
821 AR_BEGIN(BESetup
, pDC
->drawId
);
// Cache references/pointers to the pieces of API state used below.
823 const API_STATE
& state
= GetApiState(pDC
);
824 const SWR_RASTSTATE
& rastState
= state
.rastState
;
825 const SWR_PS_STATE
*pPSState
= &state
.psState
;
826 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
// Broadcast the per-triangle scalars (I/J/Z coefficients, recipDet, 1/w)
// from `work` into SIMD registers.
829 BarycentricCoeffs coeffs
;
830 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
831 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
832 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
834 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
835 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
836 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
838 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
839 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
840 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
842 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
844 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
845 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
846 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Local copies of the render target / depth / stencil pointers; they are
// advanced at the end of every SIMD tile iteration.
848 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
849 uint32_t NumRT
= state
.psState
.numRenderTargets
;
850 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
852 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
854 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
// Fill the pixel shader context with the per-triangle inputs from `work`
// and the sample pattern data from the backend traits.
857 SWR_PS_CONTEXT psContext
;
858 psContext
.pAttribs
= work
.pAttribs
;
859 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
860 psContext
.frontFace
= work
.triFlags
.frontFacing
;
861 psContext
.primID
= work
.triFlags
.primID
;
862 psContext
.pRecipW
= work
.pRecipW
;
863 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
864 psContext
.I
= work
.I
;
865 psContext
.J
= work
.J
;
866 psContext
.recipDet
= work
.recipDet
;
867 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
868 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
869 psContext
.rasterizerSampleCount
= T::MultisampleT::numSamples
;
871 psContext
.sampleIndex
= 0;
// Depth/stencil test functor. It is invoked below for the early or late Z
// test, and its vCoverageMask / depthPassMask / stencilPassMask / vZ members
// are read per sample in the output-merger loop.
873 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, workerId
, work
, coeffs
, state
, pDepthBase
, pStencilBase
, rastState
.clipDistanceMask
);
// Walk the tile in steps of SIMD_TILE_Y_DIM rows and SIMD_TILE_X_DIM columns.
875 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
877 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
878 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
879 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
881 simdscalar activeLanes
;
// Nothing covered in this SIMD tile: jump straight to the end-of-tile
// bookkeeping. MASK is defined outside this chunk.
882 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
883 activeLanes
= vMask(work
.anyCoveredSamples
& MASK
);
885 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
886 // set pixel center positions
887 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
889 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
890 CalcPixelBarycentrics(coeffs
, psContext
);
891 AR_END(BEBarycentric
, 0);
// Generate the shader input coverage mask when the traits request one.
// Inner-conservative mode sources it from work.innerCoverageMask, every
// other mode from work.coverageMask.
893 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
895 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
:
896 &work
.coverageMask
[0];
897 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, pBlendState
->sampleMask
);
// Centroid setup. NOTE(review): the conditional/else lines that select
// between CalcCentroidPos and the "UL + 0.5" fallbacks below are not all
// visible in this extract; presumably the fallbacks are else-branches.
902 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
903 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
904 if(T::bIsStandardPattern
)
906 CalcCentroidPos
<T
>(psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
910 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
911 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
914 CalcCentroidBarycentrics(coeffs
, psContext
, psContext
.vX
.UL
, psContext
.vY
.UL
);
915 AR_END(BEBarycentric
, 0);
919 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
920 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
923 if(T::bForcedSampleCount
)
925 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
926 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState
->sampleMask
), _simd_setzero_si()));
927 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
// Early depth/stencil test: only when the traits allow early Z and forced
// sample count is not in use.
931 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
933 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
934 UPDATE_STAT(DepthPassCount
, depthPassCount
);
937 // if we have no covered samples that passed depth at this point, go to next tile
938 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// Source depth is only interpolated when the pixel shader state says the
// shader uses it.
940 if(pPSState
->usesSourceDepth
)
942 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
943 // interpolate and quantize z
944 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
945 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
946 AR_END(BEBarycentric
, 0);
949 // pixels that are currently active
950 psContext
.activeMask
= _simd_castps_si(activeLanes
);
951 psContext
.oMask
= T::MultisampleT::FullSampleMask();
953 // execute pixel shader
954 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
955 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
// Count one PS invocation per lane that was active going into the shader.
956 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
957 AR_END(BEPixelShader
, 0);
959 // update active lanes to remove any discarded or oMask'd pixels
960 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
961 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// Late depth/stencil test: the counterpart of the early test for traits
// that cannot early-Z.
964 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
966 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
967 UPDATE_STAT(DepthPassCount
, depthPassCount
);
970 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
971 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
974 // loop over all samples, broadcasting the results of the PS to all passing pixels
975 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(pBlendState
->sampleCount
); sample
++)
977 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
978 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
979 uint32_t coverageSampleNum
= (T::bIsStandardPattern
) ? sample
: 0;
980 simdscalar coverageMask
, depthMask
;
// With forced sample count the active lanes serve as both coverage and
// depth mask. NOTE(review): the per-sample masks read from PixelRateZTest
// below are presumably the else-branch; the `else` line is not visible.
981 if(T::bForcedSampleCount
)
983 coverageMask
= depthMask
= activeLanes
;
987 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
988 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
989 if(!_simd_movemask_ps(depthMask
))
991 // stencil should already have been written in early/lateZ tests
992 AR_END(BEOutputMerger
, 0);
997 // broadcast the results of the PS to all passing pixels
998 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, pPSState
->numRenderTargets
);
// Depth/stencil write for this sample, skipped when early Z is forced or
// forced sample count is in use.
1000 if(!pPSState
->forceEarlyZ
&& !T::bForcedSampleCount
)
1002 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
1003 uint8_t * pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
1005 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
1006 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
1008 AR_END(BEOutputMerger
, 0);
// End-of-tile bookkeeping: shift the bits of the SIMD tile just processed out
// of every coverage mask and advance the depth/stencil/color pointers by
// KNOB_SIMD_WIDTH pixels' worth of bytes.
// NOTE(review): the `Endtile` label is not visible in this extract;
// presumably it sits at the start of this section - confirm.
1011 AR_BEGIN(BEEndTile
, pDC
->drawId
);
1012 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1014 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1017 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
1019 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1021 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1022 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1023 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1025 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1027 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1029 AR_END(BEEndTile
, 0);
1032 AR_END(BEPixelRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Backend used when no pixel shader is bound. For every SIMD tile
///        of the tile starting at (x, y) and for every sample enabled in
///        the blend state sample mask, it interpolates Z, runs
///        DepthStencilTest followed by DepthStencilWrite and updates the
///        DepthPassCount stat. No pixel shader or output merger is called.
/// @tparam sampleCountT - multisample count used to select the backend
///         traits (always paired with SWR_MSAA_STANDARD_PATTERN here).
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID (not referenced by the visible body).
/// @param x - starting x of the tile walked by the inner loop.
/// @param y - starting y of the tile walked by the outer loop.
/// @param work - triangle descriptor. Modified: per-sample coverage masks
///        are shifted right as each SIMD tile is consumed.
/// @param renderBuffers - supplies the depth and stencil base pointers.
/// @note NOTE(review): brace-only lines are not visible in this extract, so
///       the exact loop nesting should be confirmed against the full file.
1034 // optimized backend flow with NULL PS
1035 template<uint32_t sampleCountT
>
1036 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1038 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1040 AR_BEGIN(BENullBackend
, pDC
->drawId
);
1041 ///@todo: handle center multisample pattern
1042 typedef SwrBackendTraits
<sampleCountT
, SWR_MSAA_STANDARD_PATTERN
> T
;
1043 AR_BEGIN(BESetup
, pDC
->drawId
);
1045 const API_STATE
& state
= GetApiState(pDC
);
1046 const SWR_RASTSTATE
& rastState
= pDC
->pState
->state
.rastState
;
1048 // broadcast scalars
1049 BarycentricCoeffs coeffs
;
1050 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
1051 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
1052 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
1054 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
1055 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
1056 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
1058 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
1059 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
1060 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
1062 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
// Local depth/stencil pointers, advanced once per SIMD tile below.
1064 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
// psContext is only used here as scratch storage for sample positions,
// barycentrics and interpolated Z; it is never handed to a shader.
1068 SWR_PS_CONTEXT psContext
;
1069 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
1072 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
1074 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
1077 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
1079 // iterate over active samples
1080 unsigned long sample
= 0;
1081 uint32_t sampleMask
= state
.blendState
.sampleMask
;
// _BitScanForward yields the lowest set bit; that bit is cleared right away
// so the loop ends once every enabled sample has been visited.
1082 while (_BitScanForward(&sample
, sampleMask
))
1084 sampleMask
&= ~(1 << sample
);
1085 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
1088 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
1089 // calculate per sample positions
1090 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, T::MultisampleT::vX(sample
));
1091 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, T::MultisampleT::vY(sample
));
1093 CalcSampleBarycentrics(coeffs
, psContext
);
1095 // interpolate and quantize z
1096 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1097 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
1099 AR_END(BEBarycentric
, 0);
1101 // interpolate user clip distance if available
1102 if (rastState
.clipDistanceMask
)
// Lanes flagged by ComputeUserClipMask are removed from the coverage mask.
1104 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
1105 psContext
.vI
.sample
, psContext
.vJ
.sample
);
1108 simdscalar vCoverageMask
= vMask(coverageMask
);
// Seeded with the coverage mask; DepthStencilTest receives its address.
1109 simdscalar stencilPassMask
= vCoverageMask
;
1111 // offset depth/stencil buffers current sample
1112 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
1113 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
// Test and write back to back; there is no shader in between.
1115 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
1116 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
1117 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1118 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1119 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1120 AR_END(BEEarlyDepthTest
, 0);
// DepthPassCount is bumped by the number of lanes set in depthPassMask.
1122 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1123 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1124 UPDATE_STAT(DepthPassCount
, statCount
);
// Shift out the bits of the SIMD tile just processed for this sample.
1126 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance depth/stencil pointers by KNOB_SIMD_WIDTH pixels' worth of bytes.
1128 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1129 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1132 AR_END(BENullBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Initializes sClearTilesTable: zeroes every entry, then installs
///        the ClearMacroTile instantiation for each of the five formats
///        listed below. Entries for all other formats stay zeroed.
1135 void InitClearTilesTable()
// Start from an all-zero table so unlisted formats have no clear function.
1137 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
// The table is indexed by format enum; each slot gets the matching
// ClearMacroTile<format> instantiation.
1139 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1140 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1141 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1142 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1143 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
// Global backend function lookup tables, populated by InitBackendFuncTables().
// NOTE(review): several dimension lines and statement terminators of these
// declarations are not visible in this extract. The full shapes can be read
// from the parameter types of InitBackendSingleFuncTable,
// InitBackendPixelFuncTable and InitBackendSampleFuncTable.

// NULL-PS backends, indexed by multisample count.
1146 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
// Single-sample backends, indexed by input coverage first.
1147 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
]
// Pixel-rate backends, indexed by sample count, sample pattern and input
// coverage, followed by two-valued flag dimensions.
1151 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1152 [SWR_MSAA_SAMPLE_PATTERN_COUNT
]
1153 [SWR_INPUT_COVERAGE_COUNT
]
1155 [2] // forcedSampleCount
// Sample-rate backends, indexed by sample count and input coverage first.
1158 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1159 [SWR_INPUT_COVERAGE_COUNT
]
1164 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1165 // arguments to static template arguments.
// Each GetFunc overload consumes its first runtime argument, appends the
// matching compile-time value to ArgsT... and recurses on the remaining
// arguments. The terminating overload instantiates the selected backend
// with SwrBackendTraits<ArgsT...>, so the order of the runtime arguments at
// the call site fixes the order of the traits' template arguments.
// NOTE(review): the struct header line and the switch/default lines of the
// overloads are not visible in this extract.
1166 template <uint32_t... ArgsT
>
1169 // Last Arg Terminator
// Picks the backend entry point for the requested backend kind, using the
// template arguments accumulated so far as the traits.
1170 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1174 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1175 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1176 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1178 SWR_ASSERT(0 && "Invalid backend func\n");
1184 // Recursively parse args
// Sample pattern: an unrecognized value asserts and then falls back to the
// standard pattern.
1185 template <typename
... TArgsT
>
1186 static PFN_BACKEND_FUNC
GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg
, TArgsT
... remainingArgs
)
1190 case SWR_MSAA_CENTER_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_CENTER_PATTERN
>::GetFunc(remainingArgs
...); break;
1191 case SWR_MSAA_STANDARD_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...); break;
1193 SWR_ASSERT(0 && "Invalid sample pattern\n");
1194 return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...);
1199 // Recursively parse args
// Input coverage: an unrecognized value asserts and then falls back to
// SWR_INPUT_COVERAGE_NONE.
// NOTE(review): the assert text below says "Invalid sample pattern" although
// this overload dispatches on input coverage - looks like a copy/paste slip.
1200 template <typename
... TArgsT
>
1201 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
1205 case SWR_INPUT_COVERAGE_NONE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
1206 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
1207 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
1209 SWR_ASSERT(0 && "Invalid sample pattern\n");
1210 return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
1215 // Recursively parse args
// Sample count: an unrecognized value asserts and then falls back to
// SWR_MULTISAMPLE_1X.
1216 template <typename
... TArgsT
>
1217 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1221 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1222 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1223 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1224 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1225 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1227 SWR_ASSERT(0 && "Invalid sample count\n");
1228 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1233 // Recursively parse args
// Boolean flag: appends 1 or 0 to the template arguments.
// NOTE(review): the condition line is not visible here; presumably the
// choice is made on tArg - confirm.
1234 template <typename
... TArgsT
>
1235 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1239 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1242 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
//////////////////////////////////////////////////////////////////////////
/// @brief Fills the single-sample backend table with one BEChooser
///        selection per (inputCoverage, isCentroid, canEarlyZ) combination.
///        Sample count is fixed to SWR_MULTISAMPLE_1X, the sample pattern
///        to SWR_MSAA_STANDARD_PATTERN and forced sample count to false.
/// @param table - table to populate, indexed
///        [inputCoverage][isCentroid][canEarlyZ].
1246 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
1248 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1250 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1252 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
// Argument order: sample count, sample pattern, input coverage, centroid,
// forced sample count, early Z, backend kind.
1254 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1255 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1256 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
//////////////////////////////////////////////////////////////////////////
/// @brief Fills the pixel-rate backend table with one BEChooser selection
///        for every combination of sample count, sample pattern, input
///        coverage, centroid, forced sample count and early-Z flags.
/// @param table - table to populate, indexed [sampleCount][samplePattern]
///        [inputCoverage][isCentroid][forcedSampleCount][canEarlyZ].
1262 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_MSAA_SAMPLE_PATTERN_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2][2])
1264 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1266 for(uint32_t samplePattern
= SWR_MSAA_CENTER_PATTERN
; samplePattern
< SWR_MSAA_SAMPLE_PATTERN_COUNT
; samplePattern
++)
1268 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1270 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1272 for(uint32_t forcedSampleCount
= 0; forcedSampleCount
< 2; forcedSampleCount
++)
1274 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
// The loop counters are cast back to their enum types (or turned into bools)
// so that overload resolution picks the matching BEChooser::GetFunc overload.
1276 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][forcedSampleCount
][canEarlyZ
] =
1277 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, (SWR_MSAA_SAMPLE_PATTERN
)samplePattern
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1278 (isCentroid
> 0), (forcedSampleCount
> 0), (canEarlyZ
> 0), SWR_BACKEND_MSAA_PIXEL_RATE
);
//////////////////////////////////////////////////////////////////////////
/// @brief Fills the sample-rate backend table with one BEChooser selection
///        per (sampleCount, inputCoverage, centroid, canEarlyZ)
///        combination. The sample pattern is fixed to
///        SWR_MSAA_STANDARD_PATTERN and forced sample count to false.
/// @param table - table to populate, indexed
///        [sampleCount][inputCoverage][centroid][canEarlyZ].
1287 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
1289 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1291 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1293 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1295 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
// Argument order: sample count, sample pattern, input coverage, centroid,
// forced sample count, early Z, backend kind.
1297 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1298 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1299 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
//////////////////////////////////////////////////////////////////////////
/// @brief Populates the global backend lookup tables: the single-sample,
///        pixel-rate and sample-rate tables through their Init helpers,
///        then the NULL-PS table with one BackendNullPS instantiation per
///        sample count.
/// @note NOTE(review): the end of this function may lie beyond the visible
///       extract.
1306 void InitBackendFuncTables()
1308 InitBackendSingleFuncTable(gBackendSingleSample
);
1309 InitBackendPixelFuncTable(gBackendPixelRateTable
);
1310 InitBackendSampleFuncTable(gBackendSampleRateTable
);
// NULL-PS backends are templated on sample count only.
1312 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1313 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1314 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1315 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1316 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;