1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "depthstencil.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
40 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, DWORD
[4], const SWR_RECT
& rect
);
41 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
)
50 RDTSC_START(BEDispatch
);
52 SWR_CONTEXT
*pContext
= pDC
->pContext
;
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 const API_STATE
& state
= GetApiState(pDC
);
66 SWR_CS_CONTEXT csContext
{ 0 };
67 csContext
.tileCounter
= threadGroupId
;
68 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
69 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
70 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
71 csContext
.pTGSM
= pContext
->pScratch
[workerId
];
72 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
74 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
76 UPDATE_STAT(CsInvocations
, state
.totalThreadsInGroup
);
78 RDTSC_STOP(BEDispatch
, 1, 0);
81 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
84 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
85 SWR_ASSERT(x
== 0 && y
== 0);
88 template<SWR_FORMAT format
>
89 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
91 auto lambda
= [&](int32_t comp
)
93 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
94 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
97 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
98 for (uint32_t i
= 0; i
< numIter
; ++i
)
100 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
//////////////////////////////////////////////////////////////////////////
/// @brief Clears the raster tiles of one macrotile's hot tile that fall in
///        the clear region, for every sample, then marks the hot tile dirty.
/// @tparam format - hot tile format; its FormatTraits drive the conversion,
///                  packing and swizzle of the clear value.
/// @param pDC - draw context; supplies the sample count and hot tile manager.
/// @param rt - render target attachment whose hot tile is cleared.
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices.
/// @param clear - four 32-bit clear values, loaded as floats per component.
/// @param rect - clear rectangle.
/// NOTE(review): several statements of this function (local declarations and
/// the rectangle setup between the surviving comments) are not visible in
/// this listing; the notes below cover only what is shown.
104 template<SWR_FORMAT format
>
105 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, DWORD clear
[4], const SWR_RECT
& rect
)
107 // convert clear color to hottile format
108 // clear color is in RGBA float/uint32
110 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
113 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
// Normalized components are scaled by the format's fromFloat() factor and
// converted to integer before packing.
114 if (FormatTraits
<format
>::isNormalized(comp
))
116 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
117 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
// Pack to the format's storage layout and place at the swizzled slot.
119 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
120 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
123 uint32_t tileX
, tileY
;
124 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
126 // Init to full macrotile
129 KNOB_MACROTILE_X_DIM
* int32_t(tileX
),
130 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
),
131 KNOB_MACROTILE_X_DIM
* int32_t(tileX
+ 1),
132 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
+ 1),
135 // intersect with clear rect
138 // translate to local hottile origin
139 clearTile
.Translate(-int32_t(tileX
) * KNOB_MACROTILE_X_DIM
, -int32_t(tileY
) * KNOB_MACROTILE_Y_DIM
);
141 // Make maximums inclusive (needed for convert to raster tiles)
145 // convert to raster tiles
146 clearTile
.ymin
>>= (KNOB_TILE_Y_DIM_SHIFT
);
147 clearTile
.ymax
>>= (KNOB_TILE_Y_DIM_SHIFT
);
148 clearTile
.xmin
>>= (KNOB_TILE_X_DIM_SHIFT
);
149 clearTile
.xmax
>>= (KNOB_TILE_X_DIM_SHIFT
);
// Sample count comes from the rasterizer state of the current draw.
151 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
152 // compute steps between raster tile samples / raster tiles / macro tile rows
153 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
154 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
155 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
156 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
// Fetch the hot tile and compute the byte offset of the first raster tile
// to clear; the 2D tile offset is scaled by the sample count.
158 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
);
159 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, clearTile
.xmin
, clearTile
.ymin
)) * numSamples
;
160 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
162 // loop over all raster tiles in the current hot tile
163 for (int32_t y
= clearTile
.ymin
; y
<= clearTile
.ymax
; ++y
)
165 uint8_t* pRasterTile
= pRasterTileRow
;
166 for (int32_t x
= clearTile
.xmin
; x
<= clearTile
.xmax
; ++x
)
// Each sample of a raster tile is cleared in turn; the cursor advances by
// one sample's worth of tile data after every clear.
168 for( int32_t sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
170 ClearRasterTile
<format
>(pRasterTile
, vClear
);
171 pRasterTile
+= rasterTileSampleStep
;
174 pRasterTileRow
+= macroTileRowStep
;
// The hot tile now holds data that differs from the backing surface.
177 pHotTile
->state
= HOTTILE_DIRTY
;
//////////////////////////////////////////////////////////////////////////
/// @brief Backend handler for a clear work item on one macrotile.
/// @param pDC - draw context.
/// @param workerId - worker thread id.
/// @param macroTile - macrotile id to clear.
/// @param pUserData - pointer to the CLEAR_DESC describing the clear.
///
/// Two statement sequences are visible below. The first records the clear
/// values in each affected hot tile's clearData and sets the tile state to
/// HOTTILE_CLEAR. The second converts the values and calls the per-format
/// routine from sClearTilesTable immediately.
/// NOTE(review): the condition that selects between the two sequences is not
/// visible in this listing.
181 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
185 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
186 SWR_CONTEXT
*pContext
= pDC
->pContext
;
187 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
188 uint32_t numSamples
= GetNumSamples(sampleCount
);
190 SWR_ASSERT(pClear
->flags
.bits
!= 0); // shouldn't be here without a reason.
192 RDTSC_START(BEClear
);
// Color: only SWR_ATTACHMENT_COLOR0 is touched; all four 32-bit clear
// values are copied bit-for-bit into the hot tile.
194 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
196 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_COLOR0
, true, numSamples
);
197 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
198 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
199 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
200 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
201 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
202 pHotTile
->state
= HOTTILE_CLEAR
;
// Depth: the clear depth is stored bit-for-bit in clearData[0].
205 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
207 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
208 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
209 pHotTile
->state
= HOTTILE_CLEAR
;
// Stencil: the clear stencil value is read through a DWORD pointer into
// clearData[0].
212 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
214 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
216 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearStencil
;
217 pHotTile
->state
= HOTTILE_CLEAR
;
220 RDTSC_STOP(BEClear
, 0, 0);
// Second sequence: clear immediately through the per-format routine.
225 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
226 RDTSC_START(BEClear
);
228 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
230 /// @todo clear data should come in as RGBA32_FLOAT
// The first four BYTES of clearRTColor are each scaled by 1/255 to a float,
// then the float bit patterns are handed on as the clear data.
233 clearFloat
[0] = ((uint8_t*)(&pClear
->clearRTColor
))[0] / 255.0f
;
234 clearFloat
[1] = ((uint8_t*)(&pClear
->clearRTColor
))[1] / 255.0f
;
235 clearFloat
[2] = ((uint8_t*)(&pClear
->clearRTColor
))[2] / 255.0f
;
236 clearFloat
[3] = ((uint8_t*)(&pClear
->clearRTColor
))[3] / 255.0f
;
237 clearData
[0] = *(DWORD
*)&clearFloat
[0];
238 clearData
[1] = *(DWORD
*)&clearFloat
[1];
239 clearData
[2] = *(DWORD
*)&clearFloat
[2];
240 clearData
[3] = *(DWORD
*)&clearFloat
[3];
242 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
243 SWR_ASSERT(pfnClearTiles
!= nullptr);
245 pfnClearTiles(pDC
, SWR_ATTACHMENT_COLOR0
, macroTile
, clearData
, pClear
->rect
);
248 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
251 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
252 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
253 SWR_ASSERT(pfnClearTiles
!= nullptr);
255 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, clearData
, pClear
->rect
);
258 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
// The stencil value is widened to 32 bits before being reinterpreted.
260 uint32_t value
= pClear
->clearStencil
;
262 clearData
[0] = *(DWORD
*)&value
;
// NOTE(review): unlike the color and depth paths, no SWR_ASSERT on
// pfnClearTiles is visible before the call here.
263 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
265 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, clearData
, pClear
->rect
);
268 RDTSC_STOP(BEClear
, 0, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Backend handler that stores one macrotile's hot tile for a given
///        attachment through the context's pfnStoreTile callback.
/// @param pDC - draw context.
/// @param workerId - worker thread id.
/// @param macroTile - macrotile id to store.
/// @param pData - pointer to the STORE_TILES_DESC (attachment, rect and the
///                state the hot tile should be left in afterwards).
/// NOTE(review): some lines of this function (among them the declaration of
/// x/y and any guard on pHotTile) are not visible in this listing.
273 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
275 RDTSC_START(BEStoreTiles
);
276 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
277 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// numTiles exists only for the RDTSC_STOP call at the end of the function.
279 #ifdef KNOB_ENABLE_RDTSC
280 uint32_t numTiles
= 0;
// Pick the hot tile format that matches the attachment class; an unknown
// attachment asserts and falls back to the color format.
282 SWR_FORMAT srcFormat
;
283 switch (pDesc
->attachment
)
285 case SWR_ATTACHMENT_COLOR0
:
286 case SWR_ATTACHMENT_COLOR1
:
287 case SWR_ATTACHMENT_COLOR2
:
288 case SWR_ATTACHMENT_COLOR3
:
289 case SWR_ATTACHMENT_COLOR4
:
290 case SWR_ATTACHMENT_COLOR5
:
291 case SWR_ATTACHMENT_COLOR6
:
292 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
293 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
294 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
295 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc
->attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
299 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
301 // Only need to store the hottile if it's been rendered to...
302 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, pDesc
->attachment
, false);
305 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
306 if (pHotTile
->state
== HOTTILE_CLEAR
)
308 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
309 SWR_ASSERT(pfnClearTiles
!= nullptr);
311 pfnClearTiles(pDC
, pDesc
->attachment
, macroTile
, pHotTile
->clearData
, pDesc
->rect
);
// Store when the tile is dirty, or when the requested post-store state is
// itself HOTTILE_DIRTY. The destination is the macrotile's pixel origin.
314 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
316 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
317 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
319 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
320 pDesc
->attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
// Dirty or resolved tiles move to the state requested by the descriptor.
324 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
326 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
329 RDTSC_STOP(BEStoreTiles
, numTiles
, pDC
->drawId
);
//////////////////////////////////////////////////////////////////////////
/// @brief Backend handler that forces the hot tiles of the selected
///        attachments on one macrotile into a new state without loading them.
/// @param pDC - draw context; supplies the sample count.
/// @param workerId - worker thread id.
/// @param macroTile - macrotile id.
/// @param pData - pointer to the DISCARD_INVALIDATE_TILES_DESC (attachment
///                bit mask, createNewTiles flag and the new tile state).
333 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
335 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
336 SWR_CONTEXT
*pContext
= pDC
->pContext
;
338 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
// Bit i of attachmentMask selects attachment i.
340 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
342 if (pDesc
->attachmentMask
& (1 << i
))
344 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
345 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
// NOTE(review): lines between the lookup and this assignment are not
// visible in this listing; presumably a null check on pHotTile - confirm.
348 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
354 #if KNOB_SIMD_WIDTH == 8
355 const __m256 vCenterOffsetsX
= {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
356 const __m256 vCenterOffsetsY
= {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
357 const __m256 vULOffsetsX
= {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
358 const __m256 vULOffsetsY
= {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
360 #error Unsupported vector width
363 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
365 simdscalar vClipMask
= _simd_setzero_ps();
366 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
368 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
370 // pull triangle clip distance values from clip buffer
371 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
372 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
373 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
376 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
378 // clip if interpolated clip distance is < 0 || NAN
379 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
381 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
384 return _simd_movemask_ps(vClipMask
);
//////////////////////////////////////////////////////////////////////////
/// @brief Pixel backend for the single-sample case: walks one raster tile of
///        a triangle in SIMD-tile steps and, for each SIMD tile with
///        coverage, computes barycentrics and depth, applies user clip
///        distances, runs depth/stencil test, the pixel shader and the
///        output merger, and writes depth/stencil.
/// @param pDC - draw context.
/// @param workerId - worker thread id.
/// @param x - x origin of the raster tile (loop start for xx).
/// @param y - y origin of the raster tile (loop start for yy).
/// @param work - triangle descriptor: coverage masks, plane coefficients,
///               attribute pointers and triangle flags.
/// @param renderBuffers - color/depth/stencil pointers for this tile.
/// NOTE(review): the body uses T::..., so a template header presumably
/// precedes this line; it and several control-flow lines are not visible in
/// this listing.
388 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
390 RDTSC_START(BESingleSampleBackend
);
391 RDTSC_START(BESetup
);
393 const API_STATE
& state
= GetApiState(pDC
);
394 const SWR_RASTSTATE
& rastState
= state
.rastState
;
395 const SWR_PS_STATE
*pPSState
= &state
.psState
;
396 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
397 uint64_t coverageMask
= work
.coverageMask
[0];
// Broadcast the triangle's scalar coefficients across all SIMD lanes.
400 BarycentricCoeffs coeffs
;
401 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
402 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
403 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
405 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
406 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
407 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
409 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
410 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
411 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
413 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
415 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
416 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
417 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Local copies of the buffer pointers; they are advanced after every SIMD
// tile at the end of the inner loop.
419 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
420 uint32_t NumRT
= state
.psState
.numRenderTargets
;
421 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
423 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
425 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
426 RDTSC_STOP(BESetup
, 0, 0);
// Per-triangle pixel shader inputs that do not change across the tile.
428 SWR_PS_CONTEXT psContext
;
429 psContext
.pAttribs
= work
.pAttribs
;
430 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
431 psContext
.frontFace
= work
.triFlags
.frontFacing
;
432 psContext
.primID
= work
.triFlags
.primID
;
434 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
435 psContext
.I
= work
.I
;
436 psContext
.J
= work
.J
;
437 psContext
.recipDet
= work
.recipDet
;
438 psContext
.pRecipW
= work
.pRecipW
;
439 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
440 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
441 psContext
.rasterizerSampleCount
= T::MultisampleT::numSamples
;
// Walk the raster tile one SIMD tile at a time, rows first.
443 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
446 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
448 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
450 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// Only SIMD tiles with at least one covered pixel are processed.
452 if(coverageMask
& MASK
)
454 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
456 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
458 if(T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
460 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
:
461 &work
.coverageMask
[0];
462 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, pBlendState
->sampleMask
);
465 RDTSC_START(BEBarycentric
);
466 CalcPixelBarycentrics(coeffs
, psContext
);
468 // for 1x case, centroid is pixel center
469 psContext
.vX
.centroid
= psContext
.vX
.center
;
470 psContext
.vY
.centroid
= psContext
.vY
.center
;
471 psContext
.vI
.centroid
= psContext
.vI
.center
;
472 psContext
.vJ
.centroid
= psContext
.vJ
.center
;
473 psContext
.vOneOverW
.centroid
= psContext
.vOneOverW
.center
;
475 // interpolate and quantize z
476 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
477 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
478 RDTSC_STOP(BEBarycentric
, 0, 0);
480 simdmask clipCoverageMask
= coverageMask
& MASK
;
481 // interpolate user clip distance if available
482 if(rastState
.clipDistanceMask
)
484 clipCoverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
485 psContext
.vI
.center
, psContext
.vJ
.center
);
// Depth and stencil pass masks start out equal to the clipped coverage.
488 simdscalar vCoverageMask
= vMask(clipCoverageMask
);
489 simdscalar depthPassMask
= vCoverageMask
;
490 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test ahead of shading.
// NOTE(review): a second test follows the pixel shader below; the
// conditions guarding each of them are not visible in this listing.
495 RDTSC_START(BEEarlyDepthTest
);
496 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
497 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
498 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
500 // early-exit if no pixels passed depth or earlyZ is forced on
501 if(pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
503 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
504 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
506 if (!_simd_movemask_ps(depthPassMask
))
// The shader receives the active lanes as an integer mask and may change
// them; the result is read back into vCoverageMask after the call.
513 psContext
.sampleIndex
= 0;
514 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
516 // execute pixel shader
517 RDTSC_START(BEPixelShader
);
518 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
519 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
520 RDTSC_STOP(BEPixelShader
, 0, 0);
522 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Depth/stencil test after shading, using the shader-updated coverage.
527 RDTSC_START(BELateDepthTest
);
528 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
529 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
530 RDTSC_STOP(BELateDepthTest
, 0, 0);
532 if(!_simd_movemask_ps(depthPassMask
))
534 // need to call depth/stencil write for stencil write
535 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
536 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
// Statistics: count the lanes that passed the depth test.
541 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
542 uint32_t statCount
= _mm_popcnt_u32(statMask
);
543 UPDATE_STAT(DepthPassCount
, statCount
);
546 RDTSC_START(BEOutputMerger
);
547 OutputMerger(psContext
, pColorBase
, 0, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
549 // do final depth write after all pixel kills
550 if (!pPSState
->forceEarlyZ
)
552 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
553 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
555 RDTSC_STOP(BEOutputMerger
, 0, 0);
// End of SIMD tile: drop this tile's coverage bits and advance every buffer
// pointer by one SIMD tile's worth of data.
559 RDTSC_START(BEEndTile
);
560 coverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
561 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
563 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
565 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
566 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
568 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
570 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
572 RDTSC_STOP(BEEndTile
, 0, 0);
575 RDTSC_STOP(BESingleSampleBackend
, 0, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Pixel backend that shades per sample: walks one raster tile of a
///        triangle in SIMD-tile steps and, for every sample of each SIMD
///        tile, computes sample barycentrics and depth, applies user clip
///        distances, runs depth/stencil test, the pixel shader and the
///        output merger, and writes depth/stencil at that sample's offset.
/// @param pDC - draw context.
/// @param workerId - worker thread id.
/// @param x - x origin of the raster tile (loop start for xx).
/// @param y - y origin of the raster tile (loop start for yy).
/// @param work - triangle descriptor; work.coverageMask[sample] is consumed
///               (shifted) as SIMD tiles are processed.
/// @param renderBuffers - color/depth/stencil pointers for this tile.
/// NOTE(review): the body uses T::..., so a template header presumably
/// precedes this line; it and several control-flow lines are not visible in
/// this listing.
579 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
581 RDTSC_START(BESampleRateBackend
);
582 RDTSC_START(BESetup
);
584 const API_STATE
& state
= GetApiState(pDC
);
585 const SWR_RASTSTATE
& rastState
= state
.rastState
;
586 const SWR_PS_STATE
*pPSState
= &state
.psState
;
587 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
// Broadcast the triangle's scalar coefficients across all SIMD lanes.
590 BarycentricCoeffs coeffs
;
591 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
592 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
593 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
595 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
596 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
597 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
599 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
600 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
601 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
603 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
605 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
606 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
607 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Local copies of the buffer pointers; they are advanced after every SIMD
// tile at the end of the inner loop.
609 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
610 uint32_t NumRT
= state
.psState
.numRenderTargets
;
611 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
613 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
615 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
616 RDTSC_STOP(BESetup
, 0, 0);
// Per-triangle pixel shader inputs that do not change across the tile.
618 SWR_PS_CONTEXT psContext
;
619 psContext
.pAttribs
= work
.pAttribs
;
620 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
621 psContext
.pRecipW
= work
.pRecipW
;
622 psContext
.frontFace
= work
.triFlags
.frontFacing
;
623 psContext
.primID
= work
.triFlags
.primID
;
625 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
626 psContext
.I
= work
.I
;
627 psContext
.J
= work
.J
;
628 psContext
.recipDet
= work
.recipDet
;
629 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
630 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
631 psContext
.rasterizerSampleCount
= T::MultisampleT::numSamples
;
// Walk the raster tile one SIMD tile at a time, rows first.
633 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
636 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
638 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
640 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
642 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
644 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
646 RDTSC_START(BEBarycentric
);
647 CalcPixelBarycentrics(coeffs
, psContext
);
648 RDTSC_STOP(BEBarycentric
, 0, 0);
650 if(T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
652 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
:
653 &work
.coverageMask
[0];
654 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, pBlendState
->sampleMask
);
// Centroid setup. With a standard sample pattern the centroid position is
// computed from coverage; otherwise it is UL + 0.5.
// NOTE(review): the conditions selecting between this section and the
// "centroid = sample" assignments further down are not visible here.
659 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
660 RDTSC_START(BEBarycentric
);
661 if(T::bIsStandardPattern
)
663 CalcCentroidPos
<T
>(psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
667 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
668 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
670 CalcCentroidBarycentrics(coeffs
, psContext
, psContext
.vX
.UL
, psContext
.vY
.UL
);
671 RDTSC_STOP(BEBarycentric
, 0, 0);
// NOTE(review): vX.sample / vY.sample are only assigned inside the
// per-sample loop below, so the values copied here come from an earlier
// iteration (or are unset on the first SIMD tile) - confirm this is intended.
675 psContext
.vX
.centroid
= psContext
.vX
.sample
;
676 psContext
.vY
.centroid
= psContext
.vY
.sample
;
// Shade every sample of this SIMD tile separately.
679 for(uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
681 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
684 RDTSC_START(BEBarycentric
);
685 // calculate per sample positions
686 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, T::MultisampleT::vX(sample
));
687 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, T::MultisampleT::vY(sample
));
689 CalcSampleBarycentrics(coeffs
, psContext
);
691 // interpolate and quantize z
692 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
693 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
694 RDTSC_STOP(BEBarycentric
, 0, 0);
696 // interpolate user clip distance if available
697 if (rastState
.clipDistanceMask
)
699 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
700 psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Depth and stencil pass masks start out equal to the clipped coverage.
703 simdscalar vCoverageMask
= vMask(coverageMask
);
704 simdscalar depthPassMask
= vCoverageMask
;
705 simdscalar stencilPassMask
= vCoverageMask
;
707 // offset depth/stencil buffers current sample
708 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
709 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
// Depth/stencil test ahead of shading.
// NOTE(review): a second test follows the pixel shader below; the
// conditions guarding each of them are not visible in this listing.
714 RDTSC_START(BEEarlyDepthTest
);
715 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
716 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
717 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
719 // early-exit if no samples passed depth or earlyZ is forced on.
720 if (pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
722 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
723 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
// When nothing passed, this sample's coverage bits for the SIMD tile are
// still consumed before moving on.
725 if (!_simd_movemask_ps(depthPassMask
))
727 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// The shader receives the active lanes as an integer mask and may change
// them; the result is read back into vCoverageMask after the call.
733 psContext
.sampleIndex
= sample
;
734 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
736 // execute pixel shader
737 RDTSC_START(BEPixelShader
);
738 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
739 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
740 RDTSC_STOP(BEPixelShader
, 0, 0);
742 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Depth/stencil test after shading, using the shader-updated coverage.
747 RDTSC_START(BELateDepthTest
);
748 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
749 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
750 RDTSC_STOP(BELateDepthTest
, 0, 0);
752 if (!_simd_movemask_ps(depthPassMask
))
754 // need to call depth/stencil write for stencil write
755 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
756 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
758 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Statistics: count the lanes that passed the depth test.
763 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
764 uint32_t statCount
= _mm_popcnt_u32(statMask
);
765 UPDATE_STAT(DepthPassCount
, statCount
);
768 RDTSC_START(BEOutputMerger
);
769 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
771 // do final depth write after all pixel kills
772 if (!pPSState
->forceEarlyZ
)
774 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
775 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
777 RDTSC_STOP(BEOutputMerger
, 0, 0);
// Consume this sample's coverage bits for the SIMD tile just processed.
779 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// End of SIMD tile: advance every buffer pointer by one SIMD tile's worth
// of data.
781 RDTSC_START(BEEndTile
);
782 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
784 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
786 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
787 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
789 for (uint32_t rt
= 0; rt
< NumRT
; ++rt
)
791 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
793 RDTSC_STOP(BEEndTile
, 0, 0);
796 RDTSC_STOP(BESampleRateBackend
, 0, 0);
800 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
802 RDTSC_START(BEPixelRateBackend
);
803 RDTSC_START(BESetup
);
805 const API_STATE
& state
= GetApiState(pDC
);
806 const SWR_RASTSTATE
& rastState
= state
.rastState
;
807 const SWR_PS_STATE
*pPSState
= &state
.psState
;
808 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
811 BarycentricCoeffs coeffs
;
812 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
813 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
814 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
816 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
817 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
818 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
820 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
821 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
822 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
824 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
826 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
827 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
828 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
830 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
831 uint32_t NumRT
= state
.psState
.numRenderTargets
;
832 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
834 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
836 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
837 RDTSC_STOP(BESetup
, 0, 0);
839 SWR_PS_CONTEXT psContext
;
840 psContext
.pAttribs
= work
.pAttribs
;
841 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
842 psContext
.frontFace
= work
.triFlags
.frontFacing
;
843 psContext
.primID
= work
.triFlags
.primID
;
844 psContext
.pRecipW
= work
.pRecipW
;
845 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
846 psContext
.I
= work
.I
;
847 psContext
.J
= work
.J
;
848 psContext
.recipDet
= work
.recipDet
;
849 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
850 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
851 psContext
.rasterizerSampleCount
= T::MultisampleT::numSamples
;
853 psContext
.sampleIndex
= 0;
855 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, work
, coeffs
, state
, pDepthBase
, pStencilBase
, rastState
.clipDistanceMask
);
857 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
859 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
860 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
861 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
863 simdscalar activeLanes
;
864 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
865 activeLanes
= vMask(work
.anyCoveredSamples
& MASK
);
867 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
868 // set pixel center positions
869 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
871 RDTSC_START(BEBarycentric
);
872 CalcPixelBarycentrics(coeffs
, psContext
);
873 RDTSC_STOP(BEBarycentric
, 0, 0);
875 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
877 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
:
878 &work
.coverageMask
[0];
879 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, pBlendState
->sampleMask
);
884 ///@todo: don't need to generate input coverage 2x if input coverage and centroid
885 RDTSC_START(BEBarycentric
);
886 if(T::bIsStandardPattern
)
888 CalcCentroidPos
<T
>(psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
892 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
893 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
896 CalcCentroidBarycentrics(coeffs
, psContext
, psContext
.vX
.UL
, psContext
.vY
.UL
);
897 RDTSC_STOP(BEBarycentric
, 0, 0);
901 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
902 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
905 if(T::bForcedSampleCount
)
907 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
908 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState
->sampleMask
), _simd_setzero_si()));
909 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
913 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
915 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
916 UPDATE_STAT(DepthPassCount
, depthPassCount
);
919 // if we have no covered samples that passed depth at this point, go to next tile
920 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
922 if(pPSState
->usesSourceDepth
)
924 RDTSC_START(BEBarycentric
);
925 // interpolate and quantize z
926 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
927 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
928 RDTSC_STOP(BEBarycentric
, 0, 0);
931 // pixels that are currently active
932 psContext
.activeMask
= _simd_castps_si(activeLanes
);
933 psContext
.oMask
= T::MultisampleT::FullSampleMask();
935 // execute pixel shader
936 RDTSC_START(BEPixelShader
);
937 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
938 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
939 RDTSC_STOP(BEPixelShader
, 0, 0);
941 // update active lanes to remove any discarded or oMask'd pixels
942 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
943 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
946 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
948 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
949 UPDATE_STAT(DepthPassCount
, depthPassCount
);
952 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
953 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
956 // loop over all samples, broadcasting the results of the PS to all passing pixels
957 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(pBlendState
->sampleCount
); sample
++)
959 RDTSC_START(BEOutputMerger
);
960 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
961 uint32_t coverageSampleNum
= (T::bIsStandardPattern
) ? sample
: 0;
962 simdscalar coverageMask
, depthMask
;
963 if(T::bForcedSampleCount
)
965 coverageMask
= depthMask
= activeLanes
;
969 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
970 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
971 if(!_simd_movemask_ps(depthMask
))
973 // stencil should already have been written in early/lateZ tests
974 RDTSC_STOP(BEOutputMerger
, 0, 0);
979 // broadcast the results of the PS to all passing pixels
980 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, pPSState
->numRenderTargets
);
982 if(!pPSState
->forceEarlyZ
&& !T::bForcedSampleCount
)
984 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
985 uint8_t * pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
987 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
988 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
990 RDTSC_STOP(BEOutputMerger
, 0, 0);
993 RDTSC_START(BEEndTile
);
994 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
996 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
999 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
1001 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1003 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1004 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1005 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1007 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1009 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1011 RDTSC_STOP(BEEndTile
, 0, 0);
1014 RDTSC_STOP(BEPixelRateBackend
, 0, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Backend path used when no pixel shader is bound. For each SIMD
///        tile of the raster tile, and each sample selected by the blend
///        state sample mask, it interpolates Z, runs the depth/stencil test
///        and writes depth/stencil. No pixel shader or output merger call
///        appears in this function.
/// @param pDC - pointer to draw context.
/// @param workerId - not referenced in the visible body.
/// @param x - starting X of the raster tile (first value of the xx loop).
/// @param y - starting Y of the raster tile (first value of the yy loop).
/// @param work - triangle descriptor; its per-sample coverage masks are
///        shifted (consumed) as SIMD tiles are processed.
/// @param renderBuffers - supplies the depth and stencil base pointers.
/// NOTE(review): this extract has lost brace-only lines and some statement
///        lines (the embedded line numbers skip values), so the exact
///        nesting must be confirmed against the full file.
1016 // optimized backend flow with NULL PS
1017 template<uint32_t sampleCountT
>
1018 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
// Profiling bracket for the whole function; matched by the
// RDTSC_STOP(BENullBackend) at the end.
1020 RDTSC_START(BENullBackend
);
1021 ///@todo: handle center multisample pattern
// T fixes the sample count at compile time and always selects the
// standard sample pattern (see the todo above).
1022 typedef SwrBackendTraits
<sampleCountT
, SWR_MSAA_STANDARD_PATTERN
> T
;
1023 RDTSC_START(BESetup
);
// API state and rasterizer state for this draw.
1025 const API_STATE
& state
= GetApiState(pDC
);
1026 const SWR_RASTSTATE
& rastState
= pDC
->pState
->state
.rastState
;
1028 // broadcast scalars
// Replicate the triangle's I/J/Z plane coefficients and the reciprocal
// determinant into SIMD registers.
1029 BarycentricCoeffs coeffs
;
1030 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
1031 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
1032 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
1034 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
1035 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
1036 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
1038 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
1039 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
1040 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
1042 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
// Running depth/stencil pointers; both are advanced by one SIMD tile
// after the sample loop (see the += statements near the end).
1044 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
1046 RDTSC_STOP(BESetup
, 0, 0);
// Used here only as scratch storage for per-sample positions,
// barycentrics and Z.
1048 SWR_PS_CONTEXT psContext
;
// Walk the raster tile in SIMD-tile steps: rows (yy) outside, columns (xx) inside.
1049 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Per-lane Y: vULOffsetsY plus the current row origin.
1052 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
1054 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// Per-lane X: vULOffsetsX plus the current column origin.
1057 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
1059 // iterate over active samples
1060 unsigned long sample
= 0;
// Local copy of the sample mask; bits are cleared as samples are visited.
1061 uint32_t sampleMask
= state
.blendState
.sampleMask
;
// _BitScanForward stores the index of the lowest set bit in `sample` and
// returns zero once sampleMask is empty, ending the loop.
1062 while (_BitScanForward(&sample
, sampleMask
))
// Clear the bit just found so the scan makes progress.
1064 sampleMask
&= ~(1 << sample
);
// Coverage bits of this sample restricted by MASK (defined elsewhere).
1065 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
// NOTE(review): embedded line numbers jump from 1065 to 1068 here; a guard
// on coverageMask may exist in the full file -- confirm.
1068 RDTSC_START(BEBarycentric
);
1069 // calculate per sample positions
1070 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, T::MultisampleT::vX(sample
));
1071 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, T::MultisampleT::vY(sample
));
// Presumably fills psContext.vI.sample / vJ.sample, which are read below
// -- verify against CalcSampleBarycentrics.
1073 CalcSampleBarycentrics(coeffs
, psContext
);
1075 // interpolate and quantize z
1076 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1077 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
1079 RDTSC_STOP(BEBarycentric
, 0, 0);
1081 // interpolate user clip distance if available
1082 if (rastState
.clipDistanceMask
)
// Remove the lanes flagged by ComputeUserClipMask from the coverage mask.
1084 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
1085 psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Expand the coverage bitmask to a SIMD vector mask; the stencil pass mask
// starts out equal to coverage and is passed by address to the test below.
1088 simdscalar vCoverageMask
= vMask(coverageMask
);
1089 simdscalar stencilPassMask
= vCoverageMask
;
1091 // offset depth/stencil buffers current sample
1092 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
1093 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
// Depth/stencil test followed immediately by the write; there is no
// shader stage between them in this backend.
1095 RDTSC_START(BEEarlyDepthTest
);
1096 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
1097 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1098 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1099 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1100 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
// Count the lanes set in depthPassMask and add them to the DepthPassCount stat.
1102 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1103 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1104 UPDATE_STAT(DepthPassCount
, statCount
);
// Shift out this SIMD tile's coverage bits so the next tile's bits occupy
// the low positions.
1106 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance the depth/stencil pointers by KNOB_SIMD_WIDTH pixels, converting
// bits-per-pixel to bytes.
1108 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1109 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1112 RDTSC_STOP(BENullBackend
, 0, 0);
1115 void InitClearTilesTable()
1117 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
1119 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1120 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1121 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1122 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1123 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
// Global backend dispatch tables, filled in by InitBackendFuncTables().
// NOTE(review): the trailing array dimensions and terminators of the last
// three declarations are not visible in this extract; the index order noted
// below is taken from the matching Init*FuncTable parameter types.

// Null-pixel-shader backends, indexed by multisample count.
1126 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
// Single-sample backends; InitBackendSingleFuncTable indexes this as
// [inputCoverage][isCentroid][canEarlyZ].
1127 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
]
// Pixel-rate MSAA backends; InitBackendPixelFuncTable indexes this as
// [sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ].
1131 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1132 [SWR_MSAA_SAMPLE_PATTERN_COUNT
]
1133 [SWR_INPUT_COVERAGE_COUNT
]
1135 [2] // forcedSampleCount
// Sample-rate MSAA backends; InitBackendSampleFuncTable indexes this as
// [sampleCount][inputCoverage][centroid][canEarlyZ].
1138 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1139 [SWR_INPUT_COVERAGE_COUNT
]
1144 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1145 // arguments to static template arguments.
1146 template <uint32_t... ArgsT
>
1149 // Last Arg Terminator
1150 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1154 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1155 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1156 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1158 SWR_ASSERT(0 && "Invalid backend func\n");
1164 // Recursively parse args
1165 template <typename
... TArgsT
>
1166 static PFN_BACKEND_FUNC
GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg
, TArgsT
... remainingArgs
)
1170 case SWR_MSAA_CENTER_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_CENTER_PATTERN
>::GetFunc(remainingArgs
...); break;
1171 case SWR_MSAA_STANDARD_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...); break;
1173 SWR_ASSERT(0 && "Invalid sample pattern\n");
1174 return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...);
1179 // Recursively parse args
1180 template <typename
... TArgsT
>
1181 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
1185 case SWR_INPUT_COVERAGE_NONE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
1186 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
1187 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
1189 SWR_ASSERT(0 && "Invalid sample pattern\n");
1190 return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
1195 // Recursively parse args
1196 template <typename
... TArgsT
>
1197 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1201 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1202 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1203 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1204 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1205 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1207 SWR_ASSERT(0 && "Invalid sample count\n");
1208 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1213 // Recursively parse args
1214 template <typename
... TArgsT
>
1215 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1219 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1222 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1226 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
1228 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1230 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1232 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1234 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1235 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1236 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
1242 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_MSAA_SAMPLE_PATTERN_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2][2])
1244 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1246 for(uint32_t samplePattern
= SWR_MSAA_CENTER_PATTERN
; samplePattern
< SWR_MSAA_SAMPLE_PATTERN_COUNT
; samplePattern
++)
1248 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1250 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1252 for(uint32_t forcedSampleCount
= 0; forcedSampleCount
< 2; forcedSampleCount
++)
1254 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1256 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][forcedSampleCount
][canEarlyZ
] =
1257 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, (SWR_MSAA_SAMPLE_PATTERN
)samplePattern
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1258 (isCentroid
> 0), (forcedSampleCount
> 0), (canEarlyZ
> 0), SWR_BACKEND_MSAA_PIXEL_RATE
);
1267 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
1269 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1271 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1273 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1275 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1277 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1278 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1279 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1286 void InitBackendFuncTables()
1288 InitBackendSingleFuncTable(gBackendSingleSample
);
1289 InitBackendPixelFuncTable(gBackendPixelRateTable
);
1290 InitBackendSampleFuncTable(gBackendSampleRateTable
);
1292 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1293 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1294 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1295 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1296 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;