swr: [rasterizer core] Refactor/cleanup backends
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4], const SWR_RECT& rect);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 SWR_CONTEXT *pContext = pDC->pContext;
51
52 AR_BEGIN(BEDispatch, pDC->drawId);
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->ppScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
77
78 AR_END(BEDispatch, 1);
79 }
80
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
86 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
87 {
88 // Dummy function
89 }
90
91 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
92 {
93 uint32_t x, y;
94 MacroTileMgr::getTileIndices(macroTile, x, y);
95 SWR_ASSERT(x == 0 && y == 0);
96 }
97
98 template<SWR_FORMAT format>
99 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
100 {
101 auto lambda = [&](int32_t comp)
102 {
103 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
104
105 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
106 };
107
108 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
109
110 for (uint32_t i = 0; i < numIter; ++i)
111 {
112 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
113 }
114 }
115
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief Fill one raster tile with a pre-packed clear value (SIMD16-wide
///        version, only built with USE_8x2_TILE_BACKEND).
/// @tparam format - format of the tile being written.
/// @param pTileBuffer - start of the raster tile's storage.
/// @param value - clear value; one vector per format component.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    using Traits = FormatTraits<format>;

    // Number of SIMD16 tiles that make up one raster tile.
    const uint32_t simd16TilesPerRasterTile = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);

    // Stores one component for a whole SIMD16 tile and moves the write
    // pointer past the bytes that component occupies.
    auto storeComponent = [&pTileBuffer, &value](int32_t comp)
    {
        Traits::storeSOA(comp, pTileBuffer, value.v[comp]);

        pTileBuffer += (KNOB_SIMD16_WIDTH * Traits::GetBPC(comp) / 8);
    };

    for (uint32_t remaining = simd16TilesPerRasterTile; remaining != 0; --remaining)
    {
        UnrollerL<0, Traits::numComps, 1>::step(storeComponent);
    }
}

#endif
136 template<SWR_FORMAT format>
137 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4], const SWR_RECT& rect)
138 {
139 // convert clear color to hottile format
140 // clear color is in RGBA float/uint32
141 #if USE_8x2_TILE_BACKEND
142 simd16vector vClear;
143 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
144 {
145 simd16scalar vComp;
146 vComp = _simd16_load1_ps((const float*)&clear[comp]);
147 if (FormatTraits<format>::isNormalized(comp))
148 {
149 vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
150 vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
151 }
152 vComp = FormatTraits<format>::pack(comp, vComp);
153 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
154 }
155
156 #else
157 simdvector vClear;
158 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
159 {
160 simdscalar vComp;
161 vComp = _simd_load1_ps((const float*)&clear[comp]);
162 if (FormatTraits<format>::isNormalized(comp))
163 {
164 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
165 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
166 }
167 vComp = FormatTraits<format>::pack(comp, vComp);
168 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
169 }
170
171 #endif
172 uint32_t tileX, tileY;
173 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
174
175 // Init to full macrotile
176 SWR_RECT clearTile =
177 {
178 KNOB_MACROTILE_X_DIM * int32_t(tileX),
179 KNOB_MACROTILE_Y_DIM * int32_t(tileY),
180 KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
181 KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
182 };
183
184 // intersect with clear rect
185 clearTile &= rect;
186
187 // translate to local hottile origin
188 clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
189
190 // Make maximums inclusive (needed for convert to raster tiles)
191 clearTile.xmax -= 1;
192 clearTile.ymax -= 1;
193
194 // convert to raster tiles
195 clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
196 clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
197 clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
198 clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
199
200 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
201 // compute steps between raster tile samples / raster tiles / macro tile rows
202 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
203 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
204 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
205 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
206
207 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
208 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
209 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
210
211 // loop over all raster tiles in the current hot tile
212 for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
213 {
214 uint8_t* pRasterTile = pRasterTileRow;
215 for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
216 {
217 for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
218 {
219 ClearRasterTile<format>(pRasterTile, vClear);
220 pRasterTile += rasterTileSampleStep;
221 }
222 }
223 pRasterTileRow += macroTileRowStep;
224 }
225
226 pHotTile->state = HOTTILE_DIRTY;
227 }
228
229
230 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
231 {
232 SWR_CONTEXT *pContext = pDC->pContext;
233
234 if (KNOB_FAST_CLEAR)
235 {
236 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
237 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
238 uint32_t numSamples = GetNumSamples(sampleCount);
239
240 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
241
242 AR_BEGIN(BEClear, pDC->drawId);
243
244 if (pClear->flags.mask & SWR_CLEAR_COLOR)
245 {
246 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
247 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
248 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
249 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
250 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
251 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
252 pHotTile->state = HOTTILE_CLEAR;
253 }
254
255 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
256 {
257 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
258 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
259 pHotTile->state = HOTTILE_CLEAR;
260 }
261
262 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
263 {
264 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
265
266 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
267 pHotTile->state = HOTTILE_CLEAR;
268 }
269
270 AR_END(BEClear, 1);
271 }
272 else
273 {
274 // Legacy clear
275 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
276 AR_BEGIN(BEClear, pDC->drawId);
277
278 if (pClear->flags.mask & SWR_CLEAR_COLOR)
279 {
280 /// @todo clear data should come in as RGBA32_FLOAT
281 DWORD clearData[4];
282 float clearFloat[4];
283 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
284 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
285 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
286 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
287 clearData[0] = *(DWORD*)&clearFloat[0];
288 clearData[1] = *(DWORD*)&clearFloat[1];
289 clearData[2] = *(DWORD*)&clearFloat[2];
290 clearData[3] = *(DWORD*)&clearFloat[3];
291
292 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
293 SWR_ASSERT(pfnClearTiles != nullptr);
294
295 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData, pClear->rect);
296 }
297
298 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
299 {
300 DWORD clearData[4];
301 clearData[0] = *(DWORD*)&pClear->clearDepth;
302 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
303 SWR_ASSERT(pfnClearTiles != nullptr);
304
305 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData, pClear->rect);
306 }
307
308 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
309 {
310 uint32_t value = pClear->clearStencil;
311 DWORD clearData[4];
312 clearData[0] = *(DWORD*)&value;
313 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
314
315 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData, pClear->rect);
316 }
317
318 AR_END(BEClear, 1);
319 }
320 }
321
322 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
323 SWR_RENDERTARGET_ATTACHMENT attachment)
324 {
325 SWR_CONTEXT *pContext = pDC->pContext;
326
327 AR_BEGIN(BEStoreTiles, pDC->drawId);
328
329 SWR_FORMAT srcFormat;
330 switch (attachment)
331 {
332 case SWR_ATTACHMENT_COLOR0:
333 case SWR_ATTACHMENT_COLOR1:
334 case SWR_ATTACHMENT_COLOR2:
335 case SWR_ATTACHMENT_COLOR3:
336 case SWR_ATTACHMENT_COLOR4:
337 case SWR_ATTACHMENT_COLOR5:
338 case SWR_ATTACHMENT_COLOR6:
339 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
340 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
341 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
342 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
343 }
344
345 uint32_t x, y;
346 MacroTileMgr::getTileIndices(macroTile, x, y);
347
348 // Only need to store the hottile if it's been rendered to...
349 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, attachment, false);
350 if (pHotTile)
351 {
352 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
353 if (pHotTile->state == HOTTILE_CLEAR)
354 {
355 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
356 SWR_ASSERT(pfnClearTiles != nullptr);
357
358 pfnClearTiles(pDC, attachment, macroTile, pHotTile->clearData, pDesc->rect);
359 }
360
361 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
362 {
363 int32_t destX = KNOB_MACROTILE_X_DIM * x;
364 int32_t destY = KNOB_MACROTILE_Y_DIM * y;
365
366 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
367 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
368 }
369
370
371 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
372 {
373 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
374 }
375 }
376 AR_END(BEStoreTiles, 1);
377 }
378
379 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
380 {
381 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
382
383 unsigned long rt = 0;
384 uint32_t mask = pDesc->attachmentMask;
385 while (_BitScanForward(&rt, mask))
386 {
387 mask &= ~(1 << rt);
388 ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
389 }
390 }
391
392 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
393 {
394 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
395 SWR_CONTEXT *pContext = pDC->pContext;
396
397 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
398
399 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
400 {
401 if (pDesc->attachmentMask & (1 << i))
402 {
403 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
404 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
405 if (pHotTile)
406 {
407 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
408 }
409 }
410 }
411 }
412
413 #if KNOB_SIMD_WIDTH == 8
414 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
415 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
416 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
417 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
418 #else
419 #error Unsupported vector width
420 #endif
421
422 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
423 {
424 simdscalar vClipMask = _simd_setzero_ps();
425 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
426
427 for (uint32_t i = 0; i < numClipDistance; ++i)
428 {
429 // pull triangle clip distance values from clip buffer
430 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
431 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
432 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
433
434 // interpolate
435 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
436
437 // clip if interpolated clip distance is < 0 || NAN
438 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
439
440 vClipMask = _simd_or_ps(vClipMask, vCull);
441 }
442
443 return _simd_movemask_ps(vClipMask);
444 }
445
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample backend for one raster tile of a triangle.
///        Walks the tile in SIMD-tile steps. For every step that has
///        coverage it runs the optional depth-bounds test, input coverage
///        generation, barycentric and Z interpolation, user clip-distance
///        culling, the depth/stencil test (before the pixel shader when
///        T::bCanEarlyZ, after it otherwise), the pixel shader, the output
///        merger and the depth/stencil write.
/// @tparam T - compile-time backend traits; this function reads
///        T::InputCoverage and T::bCanEarlyZ.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID; not referenced by name in this function
///        (presumably consumed by the UPDATE_STAT macro - TODO confirm).
/// @param x - x position at which the tile walk starts; added to the
///        per-lane pixel offsets.
/// @param y - y position at which the tile walk starts; added to the
///        per-lane pixel offsets.
/// @param work - triangle descriptor. coverageMask[0] (and innerCoverageMask
///        for inner-conservative input coverage) is shifted right as each
///        SIMD tile is consumed, so the descriptor is modified.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the color,
///        depth and stencil pointers.
446 template<typename T>
447 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
448 {
// NOTE(review): pContext is not referenced by name below; presumably the
// AR_* macros use it - confirm before removing.
449 SWR_CONTEXT *pContext = pDC->pContext;
450
451 AR_BEGIN(BESingleSampleBackend, pDC->drawId);
452 AR_BEGIN(BESetup, pDC->drawId);
453
454 const API_STATE &state = GetApiState(pDC);
455
456 BarycentricCoeffs coeffs;
457 SetupBarycentricCoeffs(&coeffs, work);
458
459 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
460 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
461
462 SWR_PS_CONTEXT psContext;
463 SetupPixelShaderContext<T>(&psContext, work);
464
465 AR_END(BESetup, 1);
466
// Per-lane pixel positions for the first SIMD tile row; stepped by dy at the
// bottom of the outer loop.
467 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
468 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
469
470 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
471
472 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
473 {
474 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
475 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
476
477 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
478
479 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
480 {
481 #if USE_8x2_TILE_BACKEND
// True when the SIMD_TILE_X_DIM bit of xx is set; passed to OutputMerger and
// used below to decide when the color buffer pointers advance.
482 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
483
484 #endif
// Coverage bits for the current SIMD tile (MASK is defined elsewhere).
485 simdmask coverageMask = work.coverageMask[0] & MASK;
486
487 if (coverageMask)
488 {
489 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
490 {
491 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
492
493 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
494
495 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
496 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
497
498 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
499 }
500
501 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
502 {
503 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
504
505 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
506 }
507
508 AR_BEGIN(BEBarycentric, pDC->drawId);
509
510 CalcPixelBarycentrics(coeffs, psContext);
511
512 CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
513
514 // interpolate and quantize z
515 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
516 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
517
518 AR_END(BEBarycentric, 1);
519
520 // interpolate user clip distance if available
521 if (state.rastState.clipDistanceMask)
522 {
523 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
524 }
525
526 simdscalar vCoverageMask = vMask(coverageMask);
527 simdscalar depthPassMask = vCoverageMask;
528 simdscalar stencilPassMask = vCoverageMask;
529
530 // Early-Z?
531 if (T::bCanEarlyZ)
532 {
533 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
534 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
535 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
536 AR_END(BEEarlyDepthTest, 0);
537
538 // early-exit if no pixels passed depth or earlyZ is forced on
539 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
540 {
541 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
542 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
543
544 if (!_simd_movemask_ps(depthPassMask))
545 {
546 goto Endtile;
547 }
548 }
549 }
550
551 psContext.sampleIndex = 0;
552 psContext.activeMask = _simd_castps_si(vCoverageMask);
553
554 // execute pixel shader
555 AR_BEGIN(BEPixelShader, pDC->drawId);
556 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
557 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
558 AR_END(BEPixelShader, 0);
559
// The coverage mask is reloaded from activeMask after the shader call, so
// whatever the shader left there is what the later stages see.
560 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
561
562 // late-Z
563 if (!T::bCanEarlyZ)
564 {
565 AR_BEGIN(BELateDepthTest, pDC->drawId);
566 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
567 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
568 AR_END(BELateDepthTest, 0);
569
570 if (!_simd_movemask_ps(depthPassMask))
571 {
572 // need to call depth/stencil write for stencil write
573 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
574 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
575 goto Endtile;
576 }
577 }
578
579 uint32_t statMask = _simd_movemask_ps(depthPassMask);
580 uint32_t statCount = _mm_popcnt_u32(statMask);
581 UPDATE_STAT(DepthPassCount, statCount);
582
583 // output merger
584 AR_BEGIN(BEOutputMerger, pDC->drawId);
585 #if USE_8x2_TILE_BACKEND
586 OutputMerger(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, useAlternateOffset);
587 #else
588 OutputMerger(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
589 #endif
590
591 // do final depth write after all pixel kills
592 if (!state.psState.forceEarlyZ)
593 {
594 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
595 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
596 }
597 AR_END(BEOutputMerger, 0);
598 }
599
// Reached for every SIMD tile, shaded or not: consume this tile's coverage
// bits and step the buffer pointers and x positions to the next tile.
600 Endtile:
601 AR_BEGIN(BEEndTile, pDC->drawId);
602
603 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
604 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
605 {
606 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
607 }
608
609 #if USE_8x2_TILE_BACKEND
// In the 8x2 configuration the color pointers move only when
// useAlternateOffset is set, and then by two SIMD tiles' worth of bytes.
610 if (useAlternateOffset)
611 {
612 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
613 {
614 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
615 }
616 }
617 #else
618 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
619 {
620 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
621 }
622 #endif
623 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
624 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
625
626 AR_END(BEEndTile, 0);
627
628 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
629 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
630 }
631
632 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
633 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
634 }
635
636 AR_END(BESingleSampleBackend, 0);
637 }
638
//////////////////////////////////////////////////////////////////////////
/// @brief Sample-rate backend for one raster tile of a triangle.
///        Walks the tile in SIMD-tile steps. Pixel barycentrics and
///        centroids are computed once per step; then every sample in
///        T::MultisampleT::numSamples that has coverage gets its own
///        depth/stencil location, sample-position barycentrics and Z, user
///        clip-distance culling, depth/stencil test (before the pixel shader
///        when T::bCanEarlyZ, after it otherwise), a pixel shader invocation
///        with psContext.sampleIndex set to the sample, output merge and
///        depth/stencil write.
/// @tparam T - compile-time backend traits; this function reads
///        T::InputCoverage, T::bCanEarlyZ and T::MultisampleT.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID; not referenced by name in this function
///        (presumably consumed by the UPDATE_STAT macro - TODO confirm).
/// @param x - x position at which the tile walk starts; added to the
///        per-lane pixel offsets.
/// @param y - y position at which the tile walk starts; added to the
///        per-lane pixel offsets.
/// @param work - triangle descriptor. Each coverageMask[sample] (and
///        innerCoverageMask for inner-conservative input coverage) is
///        shifted right as SIMD tiles are consumed, so the descriptor is
///        modified.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the color,
///        depth and stencil pointers.
639 template<typename T>
640 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
641 {
// NOTE(review): pContext is not referenced by name below; presumably the
// AR_* macros use it - confirm before removing.
642 SWR_CONTEXT *pContext = pDC->pContext;
643
644 AR_BEGIN(BESampleRateBackend, pDC->drawId);
645 AR_BEGIN(BESetup, pDC->drawId);
646
647 const API_STATE &state = GetApiState(pDC);
648
649 BarycentricCoeffs coeffs;
650 SetupBarycentricCoeffs(&coeffs, work);
651
652 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
653 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
654
655 SWR_PS_CONTEXT psContext;
656 SetupPixelShaderContext<T>(&psContext, work);
657
658 AR_END(BESetup, 0);
659
// Per-lane pixel positions for the first SIMD tile row; stepped by dy at the
// bottom of the outer loop.
660 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
661 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
662
663 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
664
665 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
666 {
667 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
668 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
669
670 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
671
672 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
673 {
674 #if USE_8x2_TILE_BACKEND
// True when the SIMD_TILE_X_DIM bit of xx is set; passed to OutputMerger and
// used below to decide when the color buffer pointers advance.
675 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
676
677 #endif
678 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
679 {
680 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
681
682 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
683 }
684
685 AR_BEGIN(BEBarycentric, pDC->drawId);
686
687 CalcPixelBarycentrics(coeffs, psContext);
688
689 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
690
691 AR_END(BEBarycentric, 0);
692
// Every exit from one iteration of this loop shifts coverageMask[sample]:
// the two early 'continue' paths do it themselves, all other paths reach the
// shift at the bottom of the loop body.
693 for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
694 {
// Coverage bits of this sample for the current SIMD tile (MASK is defined
// elsewhere).
695 simdmask coverageMask = work.coverageMask[sample] & MASK;
696
697 if (coverageMask)
698 {
699 // offset depth/stencil buffers current sample
700 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
701 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
702
703 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
704 {
705 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
706
707 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
708
709 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
710 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
711
712 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
713 }
714
715 AR_BEGIN(BEBarycentric, pDC->drawId);
716
717 // calculate per sample positions
718 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
719 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
720
721 CalcSampleBarycentrics(coeffs, psContext);
722
723 // interpolate and quantize z
724 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
725 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
726
727 AR_END(BEBarycentric, 0);
728
729 // interpolate user clip distance if available
730 if (state.rastState.clipDistanceMask)
731 {
732 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
733 }
734
735 simdscalar vCoverageMask = vMask(coverageMask);
736 simdscalar depthPassMask = vCoverageMask;
737 simdscalar stencilPassMask = vCoverageMask;
738
739 // Early-Z?
740 if (T::bCanEarlyZ)
741 {
742 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
743 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
744 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
745 AR_END(BEEarlyDepthTest, 0);
746
747 // early-exit if no samples passed depth or earlyZ is forced on.
748 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
749 {
750 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
751 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
752
753 if (!_simd_movemask_ps(depthPassMask))
754 {
755 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
756 continue;
757 }
758 }
759 }
760
761 psContext.sampleIndex = sample;
762 psContext.activeMask = _simd_castps_si(vCoverageMask);
763
764 // execute pixel shader
765 AR_BEGIN(BEPixelShader, pDC->drawId);
766 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
767 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
768 AR_END(BEPixelShader, 0);
769
// The coverage mask is reloaded from activeMask after the shader call, so
// whatever the shader left there is what the later stages see.
770 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
771
772 // late-Z
773 if (!T::bCanEarlyZ)
774 {
775 AR_BEGIN(BELateDepthTest, pDC->drawId);
776 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
777 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
778 AR_END(BELateDepthTest, 0);
779
780 if (!_simd_movemask_ps(depthPassMask))
781 {
782 // need to call depth/stencil write for stencil write
783 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
784 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
785
786 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
787 continue;
788 }
789 }
790
791 uint32_t statMask = _simd_movemask_ps(depthPassMask);
792 uint32_t statCount = _mm_popcnt_u32(statMask);
793 UPDATE_STAT(DepthPassCount, statCount);
794
795 // output merger
796 AR_BEGIN(BEOutputMerger, pDC->drawId);
797 #if USE_8x2_TILE_BACKEND
798 OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, useAlternateOffset);
799 #else
800 OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
801 #endif
802
803 // do final depth write after all pixel kills
804 if (!state.psState.forceEarlyZ)
805 {
806 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
807 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
808 }
809 AR_END(BEOutputMerger, 0);
810 }
811 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
812 }
813
// No goto in this function targets Endtile; ATTR_UNUSED follows the label
// (presumably to silence an unused-label warning - confirm).
814 Endtile:
815 ATTR_UNUSED;
816
817 AR_BEGIN(BEEndTile, pDC->drawId);
818
819 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
820 {
821 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
822 }
823
824 #if USE_8x2_TILE_BACKEND
// In the 8x2 configuration the color pointers move only when
// useAlternateOffset is set, and then by two SIMD tiles' worth of bytes.
825 if (useAlternateOffset)
826 {
827 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
828 {
829 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
830 }
831 }
832 #else
833 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
834 {
835 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
836 }
837 #endif
838 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
839 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
840
841 AR_END(BEEndTile, 0);
842
843 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
844 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
845 }
846
847 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
848 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
849 }
850
851 AR_END(BESampleRateBackend, 0);
852 }
853
854 template<typename T>
855 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
856 {
857 SWR_CONTEXT *pContext = pDC->pContext;
858
859 AR_BEGIN(BEPixelRateBackend, pDC->drawId);
860 AR_BEGIN(BESetup, pDC->drawId);
861
862 const API_STATE &state = GetApiState(pDC);
863
864 BarycentricCoeffs coeffs;
865 SetupBarycentricCoeffs(&coeffs, work);
866
867 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
868 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
869
870 SWR_PS_CONTEXT psContext;
871 SetupPixelShaderContext<T>(&psContext, work);
872
873 AR_END(BESetup, 0);
874
875 PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
876
877 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
878 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
879
880 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
881
882 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
883 {
884 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
885 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
886
887 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
888
889 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
890 {
891 #if USE_8x2_TILE_BACKEND
892 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
893
894 #endif
895 simdscalar activeLanes;
896 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
897 activeLanes = vMask(work.anyCoveredSamples & MASK);
898
899 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
900 {
901 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
902
903 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
904 }
905
906 AR_BEGIN(BEBarycentric, pDC->drawId);
907
908 CalcPixelBarycentrics(coeffs, psContext);
909
910 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
911
912 AR_END(BEBarycentric, 0);
913
914 if(T::bForcedSampleCount)
915 {
916 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
917 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
918 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
919 }
920
921 // Early-Z?
922 if(T::bCanEarlyZ && !T::bForcedSampleCount)
923 {
924 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
925 UPDATE_STAT(DepthPassCount, depthPassCount);
926 }
927
928 // if we have no covered samples that passed depth at this point, go to next tile
929 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
930
931 if(state.psState.usesSourceDepth)
932 {
933 AR_BEGIN(BEBarycentric, pDC->drawId);
934 // interpolate and quantize z
935 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
936 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
937 AR_END(BEBarycentric, 0);
938 }
939
940 // pixels that are currently active
941 psContext.activeMask = _simd_castps_si(activeLanes);
942 psContext.oMask = T::MultisampleT::FullSampleMask();
943
944 // execute pixel shader
945 AR_BEGIN(BEPixelShader, pDC->drawId);
946 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
947 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
948 AR_END(BEPixelShader, 0);
949
950 // update active lanes to remove any discarded or oMask'd pixels
951 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
952 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
953
954 // late-Z
955 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
956 {
957 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
958 UPDATE_STAT(DepthPassCount, depthPassCount);
959 }
960
961 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
962 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
963
964 // output merger
965 // loop over all samples, broadcasting the results of the PS to all passing pixels
966 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
967 {
968 AR_BEGIN(BEOutputMerger, pDC->drawId);
969 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
970 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
971 simdscalar coverageMask, depthMask;
972 if(T::bForcedSampleCount)
973 {
974 coverageMask = depthMask = activeLanes;
975 }
976 else
977 {
978 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
979 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
980 if(!_simd_movemask_ps(depthMask))
981 {
982 // stencil should already have been written in early/lateZ tests
983 AR_END(BEOutputMerger, 0);
984 continue;
985 }
986 }
987
988 // broadcast the results of the PS to all passing pixels
989 #if USE_8x2_TILE_BACKEND
990 OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, useAlternateOffset);
991 #else
992 OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
993 #endif
994
995 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
996 {
997 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
998 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
999
1000 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1001 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1002 }
1003 AR_END(BEOutputMerger, 0);
1004 }
1005 Endtile:
1006 AR_BEGIN(BEEndTile, pDC->drawId);
1007
1008 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1009 {
1010 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1011 }
1012
1013 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1014 {
1015 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1016 }
1017 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1018
1019 #if USE_8x2_TILE_BACKEND
1020 if (useAlternateOffset)
1021 {
1022 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1023 {
1024 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1025 }
1026 }
1027 #else
1028 for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1029 {
1030 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1031 }
1032 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1033 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1034 #endif
1035
1036 AR_END(BEEndTile, 0);
1037
1038 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
1039 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
1040 }
1041
1042 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
1043 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
1044 }
1045
1046 AR_END(BEPixelRateBackend, 0);
1047 }
1048 // optimized backend flow with NULL PS
1049 template<uint32_t sampleCountT>
1050 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1051 {
1052 SWR_CONTEXT *pContext = pDC->pContext;
1053
1054 AR_BEGIN(BENullBackend, pDC->drawId);
1055 ///@todo: handle center multisample pattern
1056 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1057 AR_BEGIN(BESetup, pDC->drawId);
1058
1059 const API_STATE &state = GetApiState(pDC);
1060
1061 BarycentricCoeffs coeffs;
1062 SetupBarycentricCoeffs(&coeffs, work);
1063
1064 uint8_t *pDepthBuffer, *pStencilBuffer;
1065 SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
1066
1067 SWR_PS_CONTEXT psContext;
1068 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
1069
1070 AR_END(BESetup, 0);
1071
1072 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
1073
1074 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
1075
1076 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1077 {
1078 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
1079
1080 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
1081
1082 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1083 {
1084 // iterate over active samples
1085 unsigned long sample = 0;
1086 uint32_t sampleMask = state.blendState.sampleMask;
1087 while (_BitScanForward(&sample, sampleMask))
1088 {
1089 sampleMask &= ~(1 << sample);
1090
1091 simdmask coverageMask = work.coverageMask[sample] & MASK;
1092
1093 if (coverageMask)
1094 {
1095 // offset depth/stencil buffers current sample
1096 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1097 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1098
1099 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
1100 {
1101 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
1102
1103 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
1104
1105 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
1106 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
1107
1108 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
1109 }
1110
1111 AR_BEGIN(BEBarycentric, pDC->drawId);
1112
1113 // calculate per sample positions
1114 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1115 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1116
1117 CalcSampleBarycentrics(coeffs, psContext);
1118
1119 // interpolate and quantize z
1120 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1121 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1122
1123 AR_END(BEBarycentric, 0);
1124
1125 // interpolate user clip distance if available
1126 if (state.rastState.clipDistanceMask)
1127 {
1128 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
1129 }
1130
1131 simdscalar vCoverageMask = vMask(coverageMask);
1132 simdscalar stencilPassMask = vCoverageMask;
1133
1134 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
1135 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
1136 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1137 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1138 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1139 AR_END(BEEarlyDepthTest, 0);
1140
1141 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1142 uint32_t statCount = _mm_popcnt_u32(statMask);
1143 UPDATE_STAT(DepthPassCount, statCount);
1144 }
1145
1146 Endtile:
1147 ATTR_UNUSED;
1148 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1149 }
1150
1151 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1152 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1153
1154 vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
1155 }
1156
1157 vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
1158 }
1159
1160 AR_END(BENullBackend, 0);
1161 }
1162
1163 void InitClearTilesTable()
1164 {
1165 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1166
1167 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1168 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1169 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1170 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1171 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1172 }
1173
1174 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
1175 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
1176 [2] // centroid
1177 [2] // canEarlyZ
1178 = {};
1179 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1180 [SWR_MSAA_SAMPLE_PATTERN_COUNT]
1181 [SWR_INPUT_COVERAGE_COUNT]
1182 [2] // centroid
1183 [2] // forcedSampleCount
1184 [2] // canEarlyZ
1185 = {};
1186 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1187 [SWR_INPUT_COVERAGE_COUNT]
1188 [2] // centroid
1189 [2] // canEarlyZ
1190 = {};
1191
1192 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1193 // arguments to static template arguments.
1194 template <uint32_t... ArgsT>
1195 struct BEChooser
1196 {
1197 // Last Arg Terminator
1198 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1199 {
1200 switch(tArg)
1201 {
1202 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1203 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1204 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1205 default:
1206 SWR_ASSERT(0 && "Invalid backend func\n");
1207 return nullptr;
1208 break;
1209 }
1210 }
1211
1212 // Recursively parse args
1213 template <typename... TArgsT>
1214 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1215 {
1216 switch(tArg)
1217 {
1218 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1219 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1220 default:
1221 SWR_ASSERT(0 && "Invalid sample pattern\n");
1222 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1223 break;
1224 }
1225 }
1226
1227 // Recursively parse args
1228 template <typename... TArgsT>
1229 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
1230 {
1231 switch(tArg)
1232 {
1233 case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
1234 case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
1235 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
1236 default:
1237 SWR_ASSERT(0 && "Invalid sample pattern\n");
1238 return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
1239 break;
1240 }
1241 }
1242
1243 // Recursively parse args
1244 template <typename... TArgsT>
1245 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1246 {
1247 switch(tArg)
1248 {
1249 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1250 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1251 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1252 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1253 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1254 default:
1255 SWR_ASSERT(0 && "Invalid sample count\n");
1256 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1257 break;
1258 }
1259 }
1260
1261 // Recursively parse args
1262 template <typename... TArgsT>
1263 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1264 {
1265 if(tArg == true)
1266 {
1267 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1268 }
1269
1270 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1271 }
1272 };
1273
1274 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
1275 {
1276 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1277 {
1278 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1279 {
1280 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1281 {
1282 table[inputCoverage][isCentroid][canEarlyZ] =
1283 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1284 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1285 }
1286 }
1287 }
1288 }
1289
1290 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
1291 {
1292 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1293 {
1294 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
1295 {
1296 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1297 {
1298 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1299 {
1300 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1301 {
1302 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1303 {
1304 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1305 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
1306 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1307 }
1308 }
1309 }
1310 }
1311 }
1312 }
1313 }
1314
1315 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
1316 {
1317 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1318 {
1319 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1320 {
1321 for(uint32_t centroid = 0; centroid < 2; centroid++)
1322 {
1323 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1324 {
1325 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1326 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1327 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1328 }
1329 }
1330 }
1331 }
1332 }
1333
1334 void InitBackendFuncTables()
1335 {
1336 InitBackendSingleFuncTable(gBackendSingleSample);
1337 InitBackendPixelFuncTable(gBackendPixelRateTable);
1338 InitBackendSampleFuncTable(gBackendSampleRateTable);
1339
1340 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1341 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1342 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1343 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1344 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1345 }