8c9449baa00665aee62082bf37ff03df2df5c011
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "tilemgr.h"
34 #include "memory/tilingtraits.h"
35 #include "core/multisample.h"
36
37 #include <algorithm>
38
39 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
40 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
41
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
49 {
50 SWR_CONTEXT *pContext = pDC->pContext;
51
52 AR_BEGIN(BEDispatch, pDC->drawId);
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances;
65 if (scratchSpaceSize && pScratchSpace == nullptr)
66 {
67 pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES);
68 }
69
70 const API_STATE& state = GetApiState(pDC);
71
72 SWR_CS_CONTEXT csContext{ 0 };
73 csContext.tileCounter = threadGroupId;
74 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
75 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
76 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
77 csContext.pTGSM = pContext->ppScratch[workerId];
78 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
79 csContext.pScratchSpace = (uint8_t*)pScratchSpace;
80 csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize;
81
82 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
83
84 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
85
86 AR_END(BEDispatch, 1);
87 }
88
89 //////////////////////////////////////////////////////////////////////////
90 /// @brief Process shutdown.
91 /// @param pDC - pointer to draw context (dispatch).
92 /// @param workerId - The unique worker ID that is assigned to this thread.
93 /// @param threadGroupId - the linear index for the thread group within the dispatch.
94 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
95 {
96 // Dummy function
97 }
98
99 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
100 {
101 uint32_t x, y;
102 MacroTileMgr::getTileIndices(macroTile, x, y);
103 SWR_ASSERT(x == 0 && y == 0);
104 }
105
106 template<SWR_FORMAT format>
107 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
108 {
109 auto lambda = [&](int32_t comp)
110 {
111 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
112
113 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
114 };
115
116 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
117
118 for (uint32_t i = 0; i < numIter; ++i)
119 {
120 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
121 }
122 }
123
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief Fill one raster tile with an already packed clear value (simd16vector variant,
///        only built when USE_8x2_TILE_BACKEND is enabled).
/// @tparam format - surface format whose FormatTraits drive the stores.
/// @param pTileBuffer - start of the raster tile storage to overwrite.
/// @param value - clear value, one 16-wide SIMD register per component.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    using Traits = FormatTraits<format>;

    // A raster tile is written as this many SIMD16-tile sized chunks.
    const uint32_t simd16TilesPerRasterTile = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);

    // Writes one component for the current chunk, then moves the write cursor past
    // the bytes that component occupies (SIMD16 width * bits-per-component / 8).
    auto storeComponent = [&pTileBuffer, &value](int32_t comp)
    {
        Traits::storeSOA(comp, pTileBuffer, value.v[comp]);

        pTileBuffer += (KNOB_SIMD16_WIDTH * Traits::GetBPC(comp) / 8);
    };

    uint32_t chunksLeft = simd16TilesPerRasterTile;
    while (chunksLeft-- != 0)
    {
        // Invoke storeComponent for every component of the format.
        UnrollerL<0, Traits::numComps, 1>::step(storeComponent);
    }
}

#endif
144 template<SWR_FORMAT format>
145 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
146 {
147 // convert clear color to hottile format
148 // clear color is in RGBA float/uint32
149 #if USE_8x2_TILE_BACKEND
150 simd16vector vClear;
151 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
152 {
153 simd16scalar vComp;
154 vComp = _simd16_load1_ps((const float*)&clear[comp]);
155 if (FormatTraits<format>::isNormalized(comp))
156 {
157 vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
158 vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
159 }
160 vComp = FormatTraits<format>::pack(comp, vComp);
161 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
162 }
163
164 #else
165 simdvector vClear;
166 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
167 {
168 simdscalar vComp;
169 vComp = _simd_load1_ps((const float*)&clear[comp]);
170 if (FormatTraits<format>::isNormalized(comp))
171 {
172 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
173 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
174 }
175 vComp = FormatTraits<format>::pack(comp, vComp);
176 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
177 }
178
179 #endif
180 uint32_t tileX, tileY;
181 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
182
183 // Init to full macrotile
184 SWR_RECT clearTile =
185 {
186 KNOB_MACROTILE_X_DIM * int32_t(tileX),
187 KNOB_MACROTILE_Y_DIM * int32_t(tileY),
188 KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
189 KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
190 };
191
192 // intersect with clear rect
193 clearTile &= rect;
194
195 // translate to local hottile origin
196 clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
197
198 // Make maximums inclusive (needed for convert to raster tiles)
199 clearTile.xmax -= 1;
200 clearTile.ymax -= 1;
201
202 // convert to raster tiles
203 clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
204 clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
205 clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
206 clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
207
208 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
209 // compute steps between raster tile samples / raster tiles / macro tile rows
210 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
211 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
212 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
213 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
214
215 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
216 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
217 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
218
219 // loop over all raster tiles in the current hot tile
220 for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
221 {
222 uint8_t* pRasterTile = pRasterTileRow;
223 for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
224 {
225 for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
226 {
227 ClearRasterTile<format>(pRasterTile, vClear);
228 pRasterTile += rasterTileSampleStep;
229 }
230 }
231 pRasterTileRow += macroTileRowStep;
232 }
233
234 pHotTile->state = HOTTILE_DIRTY;
235 }
236
237
238 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
239 {
240 SWR_CONTEXT *pContext = pDC->pContext;
241
242 if (KNOB_FAST_CLEAR)
243 {
244 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
245 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
246 uint32_t numSamples = GetNumSamples(sampleCount);
247
248 SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
249
250 AR_BEGIN(BEClear, pDC->drawId);
251
252 if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
253 {
254 unsigned long rt = 0;
255 uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
256 while (_BitScanForward(&rt, mask))
257 {
258 mask &= ~(1 << rt);
259
260 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
261
262 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
263 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
264 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
265 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
266 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
267 pHotTile->state = HOTTILE_CLEAR;
268 }
269 }
270
271 if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
272 {
273 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
274 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
275 pHotTile->state = HOTTILE_CLEAR;
276 }
277
278 if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
279 {
280 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
281
282 pHotTile->clearData[0] = pClear->clearStencil;
283 pHotTile->state = HOTTILE_CLEAR;
284 }
285
286 AR_END(BEClear, 1);
287 }
288 else
289 {
290 // Legacy clear
291 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
292 AR_BEGIN(BEClear, pDC->drawId);
293
294 if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
295 {
296 DWORD clearData[4];
297 clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
298 clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
299 clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
300 clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
301
302 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
303 SWR_ASSERT(pfnClearTiles != nullptr);
304
305 unsigned long rt = 0;
306 uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
307 while (_BitScanForward(&rt, mask))
308 {
309 mask &= ~(1 << rt);
310
311 pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
312 }
313 }
314
315 if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
316 {
317 DWORD clearData[4];
318 clearData[0] = *(DWORD*)&pClear->clearDepth;
319 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
320 SWR_ASSERT(pfnClearTiles != nullptr);
321
322 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
323 }
324
325 if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
326 {
327 DWORD clearData[4];
328 clearData[0] = pClear->clearStencil;
329 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
330
331 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
332 }
333
334 AR_END(BEClear, 1);
335 }
336 }
337
338 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
339 SWR_RENDERTARGET_ATTACHMENT attachment)
340 {
341 SWR_CONTEXT *pContext = pDC->pContext;
342
343 AR_BEGIN(BEStoreTiles, pDC->drawId);
344
345 SWR_FORMAT srcFormat;
346 switch (attachment)
347 {
348 case SWR_ATTACHMENT_COLOR0:
349 case SWR_ATTACHMENT_COLOR1:
350 case SWR_ATTACHMENT_COLOR2:
351 case SWR_ATTACHMENT_COLOR3:
352 case SWR_ATTACHMENT_COLOR4:
353 case SWR_ATTACHMENT_COLOR5:
354 case SWR_ATTACHMENT_COLOR6:
355 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
356 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
357 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
358 default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
359 }
360
361 uint32_t x, y;
362 MacroTileMgr::getTileIndices(macroTile, x, y);
363
364 // Only need to store the hottile if it's been rendered to...
365 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
366 if (pHotTile)
367 {
368 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
369 if (pHotTile->state == HOTTILE_CLEAR)
370 {
371 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
372 SWR_ASSERT(pfnClearTiles != nullptr);
373
374 pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
375 }
376
377 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
378 {
379 int32_t destX = KNOB_MACROTILE_X_DIM * x;
380 int32_t destY = KNOB_MACROTILE_Y_DIM * y;
381
382 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
383 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
384 }
385
386
387 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
388 {
389 if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED))
390 {
391 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
392 }
393 }
394 }
395 AR_END(BEStoreTiles, 1);
396 }
397
398 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
399 {
400 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
401
402 unsigned long rt = 0;
403 uint32_t mask = pDesc->attachmentMask;
404 while (_BitScanForward(&rt, mask))
405 {
406 mask &= ~(1 << rt);
407 ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
408 }
409 }
410
411 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
412 {
413 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
414 SWR_CONTEXT *pContext = pDC->pContext;
415
416 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
417
418 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
419 {
420 if (pDesc->attachmentMask & (1 << i))
421 {
422 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
423 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
424 if (pHotTile)
425 {
426 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
427 }
428 }
429 }
430 }
431
432 #if KNOB_SIMD_WIDTH == 8
433 const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
434 const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
435 const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
436 const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
437 #else
438 #error Unsupported vector width
439 #endif
440
441 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
442 {
443 simdscalar vClipMask = _simd_setzero_ps();
444 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
445
446 for (uint32_t i = 0; i < numClipDistance; ++i)
447 {
448 // pull triangle clip distance values from clip buffer
449 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
450 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
451 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
452
453 // interpolate
454 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
455
456 // clip if interpolated clip distance is < 0 || NAN
457 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
458
459 vClipMask = _simd_or_ps(vClipMask, vCull);
460 }
461
462 return _simd_movemask_ps(vClipMask);
463 }
464
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample backend for one raster tile of a triangle. Walks the tile in
///        SIMD-tile steps; for every step that has coverage it runs the optional depth
///        bounds test, barycentric/Z interpolation, optional user clipping, early or late
///        depth/stencil test, the pixel shader and the output merger.
/// @tparam T - backend traits; T::InputCoverage and T::bCanEarlyZ are read here.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param x - x position of the tile; seeds the per-lane pixel X positions.
/// @param y - y position of the tile; seeds the per-lane pixel Y positions.
/// @param work - triangle descriptor; its coverage masks are shifted (consumed) as the
///               tile is walked.
/// @param renderBuffers - hot tile buffers handed to SetupRenderBuffers.
465 template<typename T>
466 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
467 {
// NOTE(review): pContext is never named again below; presumably the AR_* macros reference
// it (and workerId) implicitly - confirm before removing or renaming.
468 SWR_CONTEXT *pContext = pDC->pContext;
469
470 AR_BEGIN(BESingleSampleBackend, pDC->drawId);
471 AR_BEGIN(BESetup, pDC->drawId);
472
473 const API_STATE &state = GetApiState(pDC);
474
475 BarycentricCoeffs coeffs;
476 SetupBarycentricCoeffs(&coeffs, work);
477
478 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
479 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
480
481 SWR_PS_CONTEXT psContext;
482 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
483 SetupPixelShaderContext<T>(&psContext, samplePos, work);
484
485 AR_END(BESetup, 1);
486
// Per-lane Y positions (upper-left and center) for the first row of SIMD tiles.
487 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
488 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
489
// Y step applied after each row of SIMD tiles.
490 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
491
492 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
493 {
// Restart the per-lane X positions at the left edge of the tile for every row.
494 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
495 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
496
// X step applied after each SIMD tile.
497 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
498
499 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
500 {
501 #if USE_8x2_TILE_BACKEND
// True when the SIMD_TILE_X_DIM bit of xx is set; passed to the 8x2 output merger and
// used below to decide when the color pointers advance.
502 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
503
504 #endif
// Coverage for the current SIMD tile. MASK is defined outside this part of the file;
// presumably it selects one SIMD tile's worth of low bits - TODO confirm.
505 simdmask coverageMask = work.coverageMask[0] & MASK;
506
507 if (coverageMask)
508 {
// Optional depth bounds test: load the depth values currently in the depth buffer and
// keep only the lanes accepted by CalcDepthBoundsAcceptMask.
509 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
510 {
511 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
512
513 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
514
515 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
516 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
517
518 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
519 }
520
// Shader input coverage comes from the inner-conservative mask or the regular mask,
// depending on T::InputCoverage.
521 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
522 {
523 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
524
525 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
526 }
527
528 AR_BEGIN(BEBarycentric, pDC->drawId);
529
530 CalcPixelBarycentrics(coeffs, psContext);
531
532 CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
533
534 // interpolate and quantize z
535 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
536 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
537
538 AR_END(BEBarycentric, 1);
539
540 // interpolate user clip distance if available
541 if (state.rastState.clipDistanceMask)
542 {
543 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
544 }
545
// Expand the bit mask to a per-lane vector mask; depth and stencil pass masks start out
// equal to coverage.
546 simdscalar vCoverageMask = vMask(coverageMask);
547 simdscalar depthPassMask = vCoverageMask;
548 simdscalar stencilPassMask = vCoverageMask;
549
550 // Early-Z?
551 if (T::bCanEarlyZ)
552 {
553 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
554 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
555 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
556 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
557 AR_END(BEEarlyDepthTest, 0);
558
559 // early-exit if no pixels passed depth or earlyZ is forced on
560 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
561 {
// Depth/stencil is written now in both cases; the shader is skipped only when no lane
// passed the depth test.
562 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
563 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
564
565 if (!_simd_movemask_ps(depthPassMask))
566 {
567 goto Endtile;
568 }
569 }
570 }
571
572 psContext.sampleIndex = 0;
573 psContext.activeMask = _simd_castps_si(vCoverageMask);
574
575 // execute pixel shader
576 AR_BEGIN(BEPixelShader, pDC->drawId);
577 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
578 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
579 AR_END(BEPixelShader, 0);
580
// Re-read the active mask from the shader context after the shader has run.
581 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
582
583 // late-Z
584 if (!T::bCanEarlyZ)
585 {
586 AR_BEGIN(BELateDepthTest, pDC->drawId);
587 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
588 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
589 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
590 AR_END(BELateDepthTest, 0);
591
592 if (!_simd_movemask_ps(depthPassMask))
593 {
594 // need to call depth/stencil write for stencil write
595 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
596 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
597 goto Endtile;
598 }
599 } else {
600 // for early z, consolidate discards from shader
601 // into depthPassMask
602 depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
603 }
604
605 uint32_t statMask = _simd_movemask_ps(depthPassMask);
606 uint32_t statCount = _mm_popcnt_u32(statMask);
607 UPDATE_STAT_BE(DepthPassCount, statCount);
608
609 // output merger
610 AR_BEGIN(BEOutputMerger, pDC->drawId);
611 #if USE_8x2_TILE_BACKEND
612 OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
613 #else
614 OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
615 #endif
616
// With forceEarlyZ the depth/stencil write already happened right after the early test.
617 // do final depth write after all pixel kills
618 if (!state.psState.forceEarlyZ)
619 {
620 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
621 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
622 }
623 AR_END(BEOutputMerger, 0);
624 }
625
// Common tail for every SIMD tile (also the target of the early exits above): consume
// this tile's coverage bits and advance the buffer pointers and pixel positions.
626 Endtile:
627 AR_BEGIN(BEEndTile, pDC->drawId);
628
629 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
630 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
631 {
632 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
633 }
634
635 #if USE_8x2_TILE_BACKEND
// 8x2 backend: color pointers move only when useAlternateOffset is set, and then by two
// SIMD tiles' worth of bytes.
636 if (useAlternateOffset)
637 {
638 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
639 {
640 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
641 }
642 }
643 #else
644 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
645 {
646 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
647 }
648 #endif
// Depth and stencil pointers advance by one SIMD tile on every iteration.
649 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
650 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
651
652 AR_END(BEEndTile, 0);
653
654 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
655 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
656 }
657
658 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
659 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
660 }
661
662 AR_END(BESingleSampleBackend, 0);
663 }
664
665 template<typename T>
666 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
667 {
668 SWR_CONTEXT *pContext = pDC->pContext;
669
670 AR_BEGIN(BESampleRateBackend, pDC->drawId);
671 AR_BEGIN(BESetup, pDC->drawId);
672
673 const API_STATE &state = GetApiState(pDC);
674
675 BarycentricCoeffs coeffs;
676 SetupBarycentricCoeffs(&coeffs, work);
677
678 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
679 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
680
681 SWR_PS_CONTEXT psContext;
682 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
683 SetupPixelShaderContext<T>(&psContext, samplePos, work);
684
685 AR_END(BESetup, 0);
686
687 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
688 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
689
690 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
691
692 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
693 {
694 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
695 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
696
697 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
698
699 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
700 {
701 #if USE_8x2_TILE_BACKEND
702 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
703
704 #endif
705 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
706 {
707 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
708
709 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
710 }
711
712 AR_BEGIN(BEBarycentric, pDC->drawId);
713
714 CalcPixelBarycentrics(coeffs, psContext);
715
716 CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
717
718 AR_END(BEBarycentric, 0);
719
720 for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
721 {
722 simdmask coverageMask = work.coverageMask[sample] & MASK;
723
724 if (coverageMask)
725 {
726 // offset depth/stencil buffers current sample
727 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
728 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
729
730 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
731 {
732 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
733
734 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
735
736 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
737 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
738
739 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
740 }
741
742 AR_BEGIN(BEBarycentric, pDC->drawId);
743
744 // calculate per sample positions
745 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
746 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
747
748 CalcSampleBarycentrics(coeffs, psContext);
749
750 // interpolate and quantize z
751 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
752 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
753
754 AR_END(BEBarycentric, 0);
755
756 // interpolate user clip distance if available
757 if (state.rastState.clipDistanceMask)
758 {
759 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
760 }
761
762 simdscalar vCoverageMask = vMask(coverageMask);
763 simdscalar depthPassMask = vCoverageMask;
764 simdscalar stencilPassMask = vCoverageMask;
765
766 // Early-Z?
767 if (T::bCanEarlyZ)
768 {
769 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
770 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
771 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
772 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
773 AR_END(BEEarlyDepthTest, 0);
774
775 // early-exit if no samples passed depth or earlyZ is forced on.
776 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
777 {
778 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
779 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
780
781 if (!_simd_movemask_ps(depthPassMask))
782 {
783 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
784 continue;
785 }
786 }
787 }
788
789 psContext.sampleIndex = sample;
790 psContext.activeMask = _simd_castps_si(vCoverageMask);
791
792 // execute pixel shader
793 AR_BEGIN(BEPixelShader, pDC->drawId);
794 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
795 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
796 AR_END(BEPixelShader, 0);
797
798 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
799
800 // late-Z
801 if (!T::bCanEarlyZ)
802 {
803 AR_BEGIN(BELateDepthTest, pDC->drawId);
804 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
805 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
806 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
807 AR_END(BELateDepthTest, 0);
808
809 if (!_simd_movemask_ps(depthPassMask))
810 {
811 // need to call depth/stencil write for stencil write
812 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
813 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
814
815 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
816 continue;
817 }
818 }
819
820 uint32_t statMask = _simd_movemask_ps(depthPassMask);
821 uint32_t statCount = _mm_popcnt_u32(statMask);
822 UPDATE_STAT_BE(DepthPassCount, statCount);
823
824 // output merger
825 AR_BEGIN(BEOutputMerger, pDC->drawId);
826 #if USE_8x2_TILE_BACKEND
827 OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
828 #else
829 OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
830 #endif
831
832 // do final depth write after all pixel kills
833 if (!state.psState.forceEarlyZ)
834 {
835 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
836 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
837 }
838 AR_END(BEOutputMerger, 0);
839 }
840 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
841 }
842
843 Endtile:
844 ATTR_UNUSED;
845
846 AR_BEGIN(BEEndTile, pDC->drawId);
847
848 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
849 {
850 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
851 }
852
853 #if USE_8x2_TILE_BACKEND
854 if (useAlternateOffset)
855 {
856 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
857 {
858 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
859 }
860 }
861 #else
862 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
863 {
864 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
865 }
866 #endif
867 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
868 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
869
870 AR_END(BEEndTile, 0);
871
872 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
873 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
874 }
875
876 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
877 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
878 }
879
880 AR_END(BESampleRateBackend, 0);
881 }
882 // optimized backend flow with NULL PS
883 template<uint32_t sampleCountT>
884 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
885 {
886 SWR_CONTEXT *pContext = pDC->pContext;
887
888 AR_BEGIN(BENullBackend, pDC->drawId);
889 ///@todo: handle center multisample pattern
890 AR_BEGIN(BESetup, pDC->drawId);
891
892 const API_STATE &state = GetApiState(pDC);
893
894 BarycentricCoeffs coeffs;
895 SetupBarycentricCoeffs(&coeffs, work);
896
897 uint8_t *pDepthBuffer, *pStencilBuffer;
898 SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
899
900 SWR_PS_CONTEXT psContext;
901 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
902
903 AR_END(BESetup, 0);
904
905 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
906
907 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
908 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
909 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
910 {
911 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
912
913 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
914
915 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
916 {
917 // iterate over active samples
918 unsigned long sample = 0;
919 uint32_t sampleMask = state.blendState.sampleMask;
920 while (_BitScanForward(&sample, sampleMask))
921 {
922 sampleMask &= ~(1 << sample);
923
924 simdmask coverageMask = work.coverageMask[sample] & MASK;
925
926 if (coverageMask)
927 {
928 // offset depth/stencil buffers current sample
929 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
930 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
931
932 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
933 {
934 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
935
936 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
937
938 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
939 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
940
941 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
942 }
943
944 AR_BEGIN(BEBarycentric, pDC->drawId);
945
946 // calculate per sample positions
947 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
948 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));
949
950 CalcSampleBarycentrics(coeffs, psContext);
951
952 // interpolate and quantize z
953 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
954 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
955
956 AR_END(BEBarycentric, 0);
957
958 // interpolate user clip distance if available
959 if (state.rastState.clipDistanceMask)
960 {
961 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
962 }
963
964 simdscalar vCoverageMask = vMask(coverageMask);
965 simdscalar stencilPassMask = vCoverageMask;
966
967 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
968 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
969 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
970 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
971 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
972 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
973 AR_END(BEEarlyDepthTest, 0);
974
975 uint32_t statMask = _simd_movemask_ps(depthPassMask);
976 uint32_t statCount = _mm_popcnt_u32(statMask);
977 UPDATE_STAT_BE(DepthPassCount, statCount);
978 }
979
980 Endtile:
981 ATTR_UNUSED;
982 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
983 }
984
985 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
986 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
987
988 vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
989 }
990
991 vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
992 }
993
994 AR_END(BENullBackend, 0);
995 }
996
997 void InitClearTilesTable()
998 {
999 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1000
1001 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1002 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1003 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1004 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1005 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1006 }
1007
1008 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
1009 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
1010 [2] // centroid
1011 [2] // canEarlyZ
1012 = {};
1013 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1014 [2] // isCenterPattern
1015 [SWR_INPUT_COVERAGE_COUNT]
1016 [2] // centroid
1017 [2] // forcedSampleCount
1018 [2] // canEarlyZ
1019 = {};
1020 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1021 [SWR_INPUT_COVERAGE_COUNT]
1022 [2] // centroid
1023 [2] // canEarlyZ
1024 = {};
1025
1026 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1027 // arguments to static template arguments.
1028 template <uint32_t... ArgsT>
1029 struct BEChooser
1030 {
1031 // Last Arg Terminator
1032 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1033 {
1034 switch(tArg)
1035 {
1036 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1037 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1038 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1039 default:
1040 SWR_ASSERT(0 && "Invalid backend func\n");
1041 return nullptr;
1042 break;
1043 }
1044 }
1045
1046 // Recursively parse args
1047 template <typename... TArgsT>
1048 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
1049 {
1050 switch(tArg)
1051 {
1052 case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
1053 case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
1054 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
1055 default:
1056 SWR_ASSERT(0 && "Invalid sample pattern\n");
1057 return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
1058 break;
1059 }
1060 }
1061
1062 // Recursively parse args
1063 template <typename... TArgsT>
1064 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1065 {
1066 switch(tArg)
1067 {
1068 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1069 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1070 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1071 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1072 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1073 default:
1074 SWR_ASSERT(0 && "Invalid sample count\n");
1075 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1076 break;
1077 }
1078 }
1079
1080 // Recursively parse args
1081 template <typename... TArgsT>
1082 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1083 {
1084 if(tArg == true)
1085 {
1086 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1087 }
1088
1089 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1090 }
1091 };
1092
1093 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
1094 {
1095 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1096 {
1097 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1098 {
1099 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1100 {
1101 table[inputCoverage][isCentroid][canEarlyZ] =
1102 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
1103 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1104 }
1105 }
1106 }
1107 }
1108
1109 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
1110 {
1111 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1112 {
1113 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1114 {
1115 for(uint32_t centroid = 0; centroid < 2; centroid++)
1116 {
1117 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1118 {
1119 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1120 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
1121 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1122 }
1123 }
1124 }
1125 }
1126 }
1127
1128 void InitBackendPixelRate0();
1129 void InitBackendFuncTables()
1130 {
1131 InitBackendSingleFuncTable(gBackendSingleSample);
1132 InitBackendPixelRate0();
1133 InitBackendSampleFuncTable(gBackendSampleRateTable);
1134
1135 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1136 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1137 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1138 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1139 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1140 }