16c4537b213d6843b19c414ce735f1bb3b1cc8ca
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4], const SWR_RECT& rect);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 SWR_CONTEXT *pContext = pDC->pContext;
51
52 AR_BEGIN(BEDispatch, pDC->drawId);
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->ppScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
77
78 AR_END(BEDispatch, 1);
79 }
80
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
86 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
87 {
88 // Dummy function
89 }
90
91 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
92 {
93 uint32_t x, y;
94 MacroTileMgr::getTileIndices(macroTile, x, y);
95 SWR_ASSERT(x == 0 && y == 0);
96 }
97
98 template<SWR_FORMAT format>
99 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
100 {
101 auto lambda = [&](int32_t comp)
102 {
103 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
104
105 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
106 };
107
108 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
109
110 for (uint32_t i = 0; i < numIter; ++i)
111 {
112 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
113 }
114 }
115
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 variant: fill one raster tile with a pre-packed clear value.
/// @tparam format - format of the tile being written.
/// @param pTileBuffer - start of the raster tile's storage.
/// @param value - per-component clear value, already packed for `format`.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    using Traits = FormatTraits<format>;

    // Number of SIMD16 tiles that make up one raster tile.
    const uint32_t simdTilesPerRasterTile =
        (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);

    // Store one component for a whole SIMD16 tile, then step the destination
    // past the bytes that component occupies.
    auto storeComponent = [&](int32_t comp)
    {
        Traits::storeSOA(comp, pTileBuffer, value.v[comp]);
        pTileBuffer += (KNOB_SIMD16_WIDTH * Traits::GetBPC(comp) / 8);
    };

    for (uint32_t simdTile = 0; simdTile != simdTilesPerRasterTile; ++simdTile)
    {
        UnrollerL<0, Traits::numComps, 1>::step(storeComponent);
    }
}

#endif
136 template<SWR_FORMAT format>
137 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4], const SWR_RECT& rect)
138 {
139 // convert clear color to hottile format
140 // clear color is in RGBA float/uint32
141 #if USE_8x2_TILE_BACKEND
142 simd16vector vClear;
143 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
144 {
145 simd16scalar vComp;
146 vComp = _simd16_load1_ps((const float*)&clear[comp]);
147 if (FormatTraits<format>::isNormalized(comp))
148 {
149 vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
150 vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
151 }
152 vComp = FormatTraits<format>::pack(comp, vComp);
153 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
154 }
155
156 #else
157 simdvector vClear;
158 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
159 {
160 simdscalar vComp;
161 vComp = _simd_load1_ps((const float*)&clear[comp]);
162 if (FormatTraits<format>::isNormalized(comp))
163 {
164 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
165 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
166 }
167 vComp = FormatTraits<format>::pack(comp, vComp);
168 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
169 }
170
171 #endif
172 uint32_t tileX, tileY;
173 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
174
175 // Init to full macrotile
176 SWR_RECT clearTile =
177 {
178 KNOB_MACROTILE_X_DIM * int32_t(tileX),
179 KNOB_MACROTILE_Y_DIM * int32_t(tileY),
180 KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
181 KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
182 };
183
184 // intersect with clear rect
185 clearTile &= rect;
186
187 // translate to local hottile origin
188 clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
189
190 // Make maximums inclusive (needed for convert to raster tiles)
191 clearTile.xmax -= 1;
192 clearTile.ymax -= 1;
193
194 // convert to raster tiles
195 clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
196 clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
197 clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
198 clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
199
200 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
201 // compute steps between raster tile samples / raster tiles / macro tile rows
202 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
203 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
204 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
205 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
206
207 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
208 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
209 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
210
211 // loop over all raster tiles in the current hot tile
212 for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
213 {
214 uint8_t* pRasterTile = pRasterTileRow;
215 for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
216 {
217 for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
218 {
219 ClearRasterTile<format>(pRasterTile, vClear);
220 pRasterTile += rasterTileSampleStep;
221 }
222 }
223 pRasterTileRow += macroTileRowStep;
224 }
225
226 pHotTile->state = HOTTILE_DIRTY;
227 }
228
229
230 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
231 {
232 SWR_CONTEXT *pContext = pDC->pContext;
233
234 if (KNOB_FAST_CLEAR)
235 {
236 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
237 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
238 uint32_t numSamples = GetNumSamples(sampleCount);
239
240 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
241
242 AR_BEGIN(BEClear, pDC->drawId);
243
244 if (pClear->flags.mask & SWR_CLEAR_COLOR)
245 {
246 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
247 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
248 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
249 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
250 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
251 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
252 pHotTile->state = HOTTILE_CLEAR;
253 }
254
255 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
256 {
257 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
258 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
259 pHotTile->state = HOTTILE_CLEAR;
260 }
261
262 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
263 {
264 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
265
266 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
267 pHotTile->state = HOTTILE_CLEAR;
268 }
269
270 AR_END(BEClear, 1);
271 }
272 else
273 {
274 // Legacy clear
275 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
276 AR_BEGIN(BEClear, pDC->drawId);
277
278 if (pClear->flags.mask & SWR_CLEAR_COLOR)
279 {
280 /// @todo clear data should come in as RGBA32_FLOAT
281 DWORD clearData[4];
282 float clearFloat[4];
283 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
284 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
285 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
286 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
287 clearData[0] = *(DWORD*)&clearFloat[0];
288 clearData[1] = *(DWORD*)&clearFloat[1];
289 clearData[2] = *(DWORD*)&clearFloat[2];
290 clearData[3] = *(DWORD*)&clearFloat[3];
291
292 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
293 SWR_ASSERT(pfnClearTiles != nullptr);
294
295 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData, pClear->rect);
296 }
297
298 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
299 {
300 DWORD clearData[4];
301 clearData[0] = *(DWORD*)&pClear->clearDepth;
302 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
303 SWR_ASSERT(pfnClearTiles != nullptr);
304
305 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData, pClear->rect);
306 }
307
308 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
309 {
310 uint32_t value = pClear->clearStencil;
311 DWORD clearData[4];
312 clearData[0] = *(DWORD*)&value;
313 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
314
315 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData, pClear->rect);
316 }
317
318 AR_END(BEClear, 1);
319 }
320 }
321
322 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
323 SWR_RENDERTARGET_ATTACHMENT attachment)
324 {
325 SWR_CONTEXT *pContext = pDC->pContext;
326
327 AR_BEGIN(BEStoreTiles, pDC->drawId);
328
329 SWR_FORMAT srcFormat;
330 switch (attachment)
331 {
332 case SWR_ATTACHMENT_COLOR0:
333 case SWR_ATTACHMENT_COLOR1:
334 case SWR_ATTACHMENT_COLOR2:
335 case SWR_ATTACHMENT_COLOR3:
336 case SWR_ATTACHMENT_COLOR4:
337 case SWR_ATTACHMENT_COLOR5:
338 case SWR_ATTACHMENT_COLOR6:
339 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
340 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
341 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
342 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
343 }
344
345 uint32_t x, y;
346 MacroTileMgr::getTileIndices(macroTile, x, y);
347
348 // Only need to store the hottile if it's been rendered to...
349 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, attachment, false);
350 if (pHotTile)
351 {
352 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
353 if (pHotTile->state == HOTTILE_CLEAR)
354 {
355 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
356 SWR_ASSERT(pfnClearTiles != nullptr);
357
358 pfnClearTiles(pDC, attachment, macroTile, pHotTile->clearData, pDesc->rect);
359 }
360
361 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
362 {
363 int32_t destX = KNOB_MACROTILE_X_DIM * x;
364 int32_t destY = KNOB_MACROTILE_Y_DIM * y;
365
366 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
367 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
368 }
369
370
371 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
372 {
373 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
374 }
375 }
376 AR_END(BEStoreTiles, 1);
377 }
378
379 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
380 {
381 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
382
383 unsigned long rt = 0;
384 uint32_t mask = pDesc->attachmentMask;
385 while (_BitScanForward(&rt, mask))
386 {
387 mask &= ~(1 << rt);
388 ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
389 }
390 }
391
392 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
393 {
394 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
395 SWR_CONTEXT *pContext = pDC->pContext;
396
397 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
398
399 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
400 {
401 if (pDesc->attachmentMask & (1 << i))
402 {
403 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
404 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
405 if (pHotTile)
406 {
407 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
408 }
409 }
410 }
411 }
412
413 #if KNOB_SIMD_WIDTH == 8
414 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
415 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
416 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
417 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
418 #else
419 #error Unsupported vector width
420 #endif
421
422 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
423 {
424 simdscalar vClipMask = _simd_setzero_ps();
425 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
426
427 for (uint32_t i = 0; i < numClipDistance; ++i)
428 {
429 // pull triangle clip distance values from clip buffer
430 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
431 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
432 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
433
434 // interpolate
435 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
436
437 // clip if interpolated clip distance is < 0 || NAN
438 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
439
440 vClipMask = _simd_or_ps(vClipMask, vCull);
441 }
442
443 return _simd_movemask_ps(vClipMask);
444 }
445
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample backend: shades one raster tile of a triangle, one
///        SIMD tile at a time.
///
/// For each SIMD tile with coverage: optional depth-bounds test, input
/// coverage generation, barycentric setup, Z interpolation/quantization,
/// optional user-clip culling, depth/stencil test (before the pixel shader
/// when T::bCanEarlyZ, after it otherwise), pixel shader, output merger and
/// depth/stencil write. Coverage bits and buffer pointers are advanced for
/// every SIMD tile, whether or not it was shaded.
/// @tparam T - backend traits (bCanEarlyZ, InputCoverage, ...).
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param x - starting x pixel coordinate; added to the per-lane X offsets.
/// @param y - starting y pixel coordinate; added to the per-lane Y offsets.
/// @param work - triangle descriptor; its coverage masks are shifted (consumed) here.
/// @param renderBuffers - source of the color/depth/stencil buffer pointers.
446 template<typename T>
447 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
448 {
449 SWR_CONTEXT *pContext = pDC->pContext;
450
451 AR_BEGIN(BESingleSampleBackend, pDC->drawId);
452 AR_BEGIN(BESetup, pDC->drawId);
453
454 const API_STATE &state = GetApiState(pDC);
455
456 BarycentricCoeffs coeffs;
457 SetupBarycentricCoeffs(&coeffs, work);
458
459 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
460 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
461
462 SWR_PS_CONTEXT psContext;
463 SetupPixelShaderContext<T>(&psContext, work);
464
465 AR_END(BESetup, 1);
466
// Per-lane Y positions for the first row of SIMD tiles; stepped by dy per row.
467 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
468 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
469
470 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
471
472 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
473 {
// X positions restart at the left edge for every row of SIMD tiles.
474 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
475 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
476
477 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
478
479 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
480 {
481 #if USE_8x2_TILE_BACKEND
482 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
483
484 #endif
// The low bits of the coverage mask belong to the current SIMD tile; the
// mask is shifted down at Endtile so the next tile finds its bits here.
485 simdmask coverageMask = work.coverageMask[0] & MASK;
486
487 if (coverageMask)
488 {
489 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
490 {
491 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
492
// Depth bounds test uses the depth values already stored in the depth buffer.
493 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
494
495 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
496 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
497
498 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
499 }
500
501 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
502 {
503 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
504
505 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
506 }
507
508 AR_BEGIN(BEBarycentric, pDC->drawId);
509
510 CalcPixelBarycentrics(coeffs, psContext);
511
512 CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
513
514 // interpolate and quantize z
515 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
516 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
517
518 AR_END(BEBarycentric, 1);
519
520 // interpolate user clip distance if available
521 if (state.rastState.clipDistanceMask)
522 {
523 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
524 }
525
526 simdscalar vCoverageMask = vMask(coverageMask);
527 simdscalar depthPassMask = vCoverageMask;
528 simdscalar stencilPassMask = vCoverageMask;
529
530 // Early-Z?
531 if (T::bCanEarlyZ)
532 {
533 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
534 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
535 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
536 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
537 AR_END(BEEarlyDepthTest, 0);
538
539 // early-exit if no pixels passed depth or earlyZ is forced on
540 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
541 {
542 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
543 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
544
545 if (!_simd_movemask_ps(depthPassMask))
546 {
547 goto Endtile;
548 }
549 }
550 }
551
552 psContext.sampleIndex = 0;
553 psContext.activeMask = _simd_castps_si(vCoverageMask);
554
555 // execute pixel shader
556 AR_BEGIN(BEPixelShader, pDC->drawId);
557 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
558 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
559 AR_END(BEPixelShader, 0);
560
// Re-read the active mask from the shader context after the shader has run.
561 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
562
563 // late-Z
564 if (!T::bCanEarlyZ)
565 {
566 AR_BEGIN(BELateDepthTest, pDC->drawId);
567 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
568 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
569 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
570 AR_END(BELateDepthTest, 0);
571
572 if (!_simd_movemask_ps(depthPassMask))
573 {
574 // need to call depth/stencil write for stencil write
575 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
576 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
577 goto Endtile;
578 }
579 }
580
581 uint32_t statMask = _simd_movemask_ps(depthPassMask);
582 uint32_t statCount = _mm_popcnt_u32(statMask);
583 UPDATE_STAT_BE(DepthPassCount, statCount);
584
585 // output merger
586 AR_BEGIN(BEOutputMerger, pDC->drawId);
587 #if USE_8x2_TILE_BACKEND
588 OutputMerger(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, useAlternateOffset);
589 #else
590 OutputMerger(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
591 #endif
592
593 // do final depth write after all pixel kills
594 if (!state.psState.forceEarlyZ)
595 {
596 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
597 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
598 }
599 AR_END(BEOutputMerger, 0);
600 }
601
// Reached for every SIMD tile, shaded or skipped: consume this tile's
// coverage bits and advance the buffer pointers.
602 Endtile:
603 AR_BEGIN(BEEndTile, pDC->drawId);
604
605 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
606 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
607 {
608 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
609 }
610
611 #if USE_8x2_TILE_BACKEND
// With the 8x2 backend the color pointers move only after every second
// SIMD tile (useAlternateOffset set), by two SIMD tiles' worth of bytes.
612 if (useAlternateOffset)
613 {
614 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
615 {
616 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
617 }
618 }
619 #else
620 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
621 {
622 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
623 }
624 #endif
625 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
626 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
627
628 AR_END(BEEndTile, 0);
629
630 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
631 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
632 }
633
634 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
635 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
636 }
637
638 AR_END(BESingleSampleBackend, 0);
639 }
640
//////////////////////////////////////////////////////////////////////////
/// @brief Sample-rate backend: shades one raster tile of a triangle, running
///        the pixel shader once per covered sample of each SIMD tile.
///
/// Pixel barycentrics and centroid are computed once per SIMD tile. Then, for
/// each of T::MultisampleT::numSamples samples with coverage: optional
/// depth-bounds test, per-sample barycentrics and Z, optional user-clip
/// culling, depth/stencil test (before the shader when T::bCanEarlyZ, after
/// it otherwise), pixel shader, output merger and depth/stencil write.
/// @tparam T - backend traits (bCanEarlyZ, InputCoverage, MultisampleT, ...).
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param x - starting x pixel coordinate; added to the per-lane X offsets.
/// @param y - starting y pixel coordinate; added to the per-lane Y offsets.
/// @param work - triangle descriptor; its coverage masks are shifted (consumed) here.
/// @param renderBuffers - source of the color/depth/stencil buffer pointers.
641 template<typename T>
642 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
643 {
644 SWR_CONTEXT *pContext = pDC->pContext;
645
646 AR_BEGIN(BESampleRateBackend, pDC->drawId);
647 AR_BEGIN(BESetup, pDC->drawId);
648
649 const API_STATE &state = GetApiState(pDC);
650
651 BarycentricCoeffs coeffs;
652 SetupBarycentricCoeffs(&coeffs, work);
653
654 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
655 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
656
657 SWR_PS_CONTEXT psContext;
658 SetupPixelShaderContext<T>(&psContext, work);
659
660 AR_END(BESetup, 0);
661
// Per-lane Y positions for the first row of SIMD tiles; stepped by dy per row.
662 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
663 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
664
665 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
666
667 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
668 {
// X positions restart at the left edge for every row of SIMD tiles.
669 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
670 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
671
672 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
673
674 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
675 {
676 #if USE_8x2_TILE_BACKEND
677 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
678
679 #endif
680 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
681 {
682 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
683
684 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
685 }
686
687 AR_BEGIN(BEBarycentric, pDC->drawId);
688
689 CalcPixelBarycentrics(coeffs, psContext);
690
691 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
692
693 AR_END(BEBarycentric, 0);
694
695 for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
696 {
// The low bits of this sample's coverage mask belong to the current SIMD
// tile; every path through the loop body shifts them out before moving on.
697 simdmask coverageMask = work.coverageMask[sample] & MASK;
698
699 if (coverageMask)
700 {
701 // offset depth/stencil buffers current sample
702 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
703 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
704
705 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
706 {
707 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
708
// Depth bounds test uses the depth values already stored for this sample.
709 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
710
711 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
712 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
713
714 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
715 }
716
717 AR_BEGIN(BEBarycentric, pDC->drawId);
718
719 // calculate per sample positions
720 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
721 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
722
723 CalcSampleBarycentrics(coeffs, psContext);
724
725 // interpolate and quantize z
726 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
727 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
728
729 AR_END(BEBarycentric, 0);
730
731 // interpolate user clip distance if available
732 if (state.rastState.clipDistanceMask)
733 {
734 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
735 }
736
737 simdscalar vCoverageMask = vMask(coverageMask);
738 simdscalar depthPassMask = vCoverageMask;
739 simdscalar stencilPassMask = vCoverageMask;
740
741 // Early-Z?
742 if (T::bCanEarlyZ)
743 {
744 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
745 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
746 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
747 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
748 AR_END(BEEarlyDepthTest, 0);
749
750 // early-exit if no samples passed depth or earlyZ is forced on.
751 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
752 {
753 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
754 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
755
756 if (!_simd_movemask_ps(depthPassMask))
757 {
// `continue` skips the shift at the bottom of the loop, so it is repeated here.
758 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
759 continue;
760 }
761 }
762 }
763
764 psContext.sampleIndex = sample;
765 psContext.activeMask = _simd_castps_si(vCoverageMask);
766
767 // execute pixel shader
768 AR_BEGIN(BEPixelShader, pDC->drawId);
769 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
770 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
771 AR_END(BEPixelShader, 0);
772
// Re-read the active mask from the shader context after the shader has run.
773 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
774
775 // late-Z
776 if (!T::bCanEarlyZ)
777 {
778 AR_BEGIN(BELateDepthTest, pDC->drawId);
779 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
780 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
781 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
782 AR_END(BELateDepthTest, 0);
783
784 if (!_simd_movemask_ps(depthPassMask))
785 {
786 // need to call depth/stencil write for stencil write
787 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
788 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
789
// `continue` skips the shift at the bottom of the loop, so it is repeated here.
790 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
791 continue;
792 }
793 }
794
795 uint32_t statMask = _simd_movemask_ps(depthPassMask);
796 uint32_t statCount = _mm_popcnt_u32(statMask);
797 UPDATE_STAT_BE(DepthPassCount, statCount);
798
799 // output merger
800 AR_BEGIN(BEOutputMerger, pDC->drawId);
801 #if USE_8x2_TILE_BACKEND
802 OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, useAlternateOffset);
803 #else
804 OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
805 #endif
806
807 // do final depth write after all pixel kills
808 if (!state.psState.forceEarlyZ)
809 {
810 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
811 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
812 }
813 AR_END(BEOutputMerger, 0);
814 }
815 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
816 }
817
// No goto in this function targets the label below (the sample loop uses
// `continue`); ATTR_UNUSED presumably silences the unused-label warning.
818 Endtile:
819 ATTR_UNUSED;
820
821 AR_BEGIN(BEEndTile, pDC->drawId);
822
823 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
824 {
825 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
826 }
827
828 #if USE_8x2_TILE_BACKEND
// With the 8x2 backend the color pointers move only after every second
// SIMD tile (useAlternateOffset set), by two SIMD tiles' worth of bytes.
829 if (useAlternateOffset)
830 {
831 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
832 {
833 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
834 }
835 }
836 #else
837 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
838 {
839 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
840 }
841 #endif
842 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
843 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
844
845 AR_END(BEEndTile, 0);
846
847 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
848 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
849 }
850
851 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
852 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
853 }
854
855 AR_END(BESampleRateBackend, 0);
856 }
857
858 template<typename T>
859 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
860 {
861 SWR_CONTEXT *pContext = pDC->pContext;
862
863 AR_BEGIN(BEPixelRateBackend, pDC->drawId);
864 AR_BEGIN(BESetup, pDC->drawId);
865
866 const API_STATE &state = GetApiState(pDC);
867
868 BarycentricCoeffs coeffs;
869 SetupBarycentricCoeffs(&coeffs, work);
870
871 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
872 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
873
874 SWR_PS_CONTEXT psContext;
875 SetupPixelShaderContext<T>(&psContext, work);
876
877 AR_END(BESetup, 0);
878
879 PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
880
881 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
882 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
883
884 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
885
886 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
887 {
888 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
889 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
890
891 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
892
893 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
894 {
895 #if USE_8x2_TILE_BACKEND
896 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
897
898 #endif
899 simdscalar activeLanes;
900 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
901 activeLanes = vMask(work.anyCoveredSamples & MASK);
902
903 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
904 {
905 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
906
907 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
908 }
909
910 AR_BEGIN(BEBarycentric, pDC->drawId);
911
912 CalcPixelBarycentrics(coeffs, psContext);
913
914 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
915
916 AR_END(BEBarycentric, 0);
917
918 if(T::bForcedSampleCount)
919 {
920 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
921 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
922 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
923 }
924
925 // Early-Z?
926 if(T::bCanEarlyZ && !T::bForcedSampleCount)
927 {
928 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
929 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
930 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
931 }
932
933 // if we have no covered samples that passed depth at this point, go to next tile
934 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
935
936 if(state.psState.usesSourceDepth)
937 {
938 AR_BEGIN(BEBarycentric, pDC->drawId);
939 // interpolate and quantize z
940 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
941 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
942 AR_END(BEBarycentric, 0);
943 }
944
945 // pixels that are currently active
946 psContext.activeMask = _simd_castps_si(activeLanes);
947 psContext.oMask = T::MultisampleT::FullSampleMask();
948
949 // execute pixel shader
950 AR_BEGIN(BEPixelShader, pDC->drawId);
951 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
952 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
953 AR_END(BEPixelShader, 0);
954
955 // update active lanes to remove any discarded or oMask'd pixels
956 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
957 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
958
959 // late-Z
960 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
961 {
962 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
963 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
964 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
965 }
966
967 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
968 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
969
970 // output merger
971 // loop over all samples, broadcasting the results of the PS to all passing pixels
972 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
973 {
974 AR_BEGIN(BEOutputMerger, pDC->drawId);
975 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
976 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
977 simdscalar coverageMask, depthMask;
978 if(T::bForcedSampleCount)
979 {
980 coverageMask = depthMask = activeLanes;
981 }
982 else
983 {
984 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
985 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
986 if(!_simd_movemask_ps(depthMask))
987 {
988 // stencil should already have been written in early/lateZ tests
989 AR_END(BEOutputMerger, 0);
990 continue;
991 }
992 }
993
994 // broadcast the results of the PS to all passing pixels
995 #if USE_8x2_TILE_BACKEND
996 OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, useAlternateOffset);
997 #else
998 OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
999 #endif
1000
1001 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
1002 {
1003 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1004 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1005
1006 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1007 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1008 }
1009 AR_END(BEOutputMerger, 0);
1010 }
1011 Endtile:
1012 AR_BEGIN(BEEndTile, pDC->drawId);
1013
1014 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1015 {
1016 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1017 }
1018
1019 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1020 {
1021 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1022 }
1023 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1024
1025 #if USE_8x2_TILE_BACKEND
1026 if (useAlternateOffset)
1027 {
1028 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1029 {
1030 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1031 }
1032 }
1033 #else
1034 for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1035 {
1036 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1037 }
1038 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1039 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1040 #endif
1041
1042 AR_END(BEEndTile, 0);
1043
1044 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
1045 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
1046 }
1047
1048 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
1049 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
1050 }
1051
1052 AR_END(BEPixelRateBackend, 0);
1053 }
1054 // optimized backend flow with NULL PS
1055 template<uint32_t sampleCountT>
1056 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1057 {
1058 SWR_CONTEXT *pContext = pDC->pContext;
1059
1060 AR_BEGIN(BENullBackend, pDC->drawId);
1061 ///@todo: handle center multisample pattern
1062 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1063 AR_BEGIN(BESetup, pDC->drawId);
1064
1065 const API_STATE &state = GetApiState(pDC);
1066
1067 BarycentricCoeffs coeffs;
1068 SetupBarycentricCoeffs(&coeffs, work);
1069
1070 uint8_t *pDepthBuffer, *pStencilBuffer;
1071 SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
1072
1073 SWR_PS_CONTEXT psContext;
1074 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
1075
1076 AR_END(BESetup, 0);
1077
1078 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
1079
1080 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
1081
1082 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1083 {
1084 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
1085
1086 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
1087
1088 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1089 {
1090 // iterate over active samples
1091 unsigned long sample = 0;
1092 uint32_t sampleMask = state.blendState.sampleMask;
1093 while (_BitScanForward(&sample, sampleMask))
1094 {
1095 sampleMask &= ~(1 << sample);
1096
1097 simdmask coverageMask = work.coverageMask[sample] & MASK;
1098
1099 if (coverageMask)
1100 {
1101 // offset depth/stencil buffers current sample
1102 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1103 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1104
1105 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
1106 {
1107 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
1108
1109 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
1110
1111 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
1112 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
1113
1114 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
1115 }
1116
1117 AR_BEGIN(BEBarycentric, pDC->drawId);
1118
1119 // calculate per sample positions
1120 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1121 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1122
1123 CalcSampleBarycentrics(coeffs, psContext);
1124
1125 // interpolate and quantize z
1126 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1127 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1128
1129 AR_END(BEBarycentric, 0);
1130
1131 // interpolate user clip distance if available
1132 if (state.rastState.clipDistanceMask)
1133 {
1134 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
1135 }
1136
1137 simdscalar vCoverageMask = vMask(coverageMask);
1138 simdscalar stencilPassMask = vCoverageMask;
1139
1140 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
1141 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
1142 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1143 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
1144 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1145 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1146 AR_END(BEEarlyDepthTest, 0);
1147
1148 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1149 uint32_t statCount = _mm_popcnt_u32(statMask);
1150 UPDATE_STAT_BE(DepthPassCount, statCount);
1151 }
1152
1153 Endtile:
1154 ATTR_UNUSED;
1155 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1156 }
1157
1158 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1159 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1160
1161 vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
1162 }
1163
1164 vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
1165 }
1166
1167 AR_END(BENullBackend, 0);
1168 }
1169
1170 void InitClearTilesTable()
1171 {
1172 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1173
1174 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1175 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1176 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1177 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1178 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1179 }
1180
1181 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
1182 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
1183 [2] // centroid
1184 [2] // canEarlyZ
1185 = {};
1186 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1187 [SWR_MSAA_SAMPLE_PATTERN_COUNT]
1188 [SWR_INPUT_COVERAGE_COUNT]
1189 [2] // centroid
1190 [2] // forcedSampleCount
1191 [2] // canEarlyZ
1192 = {};
1193 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1194 [SWR_INPUT_COVERAGE_COUNT]
1195 [2] // centroid
1196 [2] // canEarlyZ
1197 = {};
1198
1199 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1200 // arguments to static template arguments.
1201 template <uint32_t... ArgsT>
1202 struct BEChooser
1203 {
1204 // Last Arg Terminator
1205 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1206 {
1207 switch(tArg)
1208 {
1209 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1210 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1211 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1212 default:
1213 SWR_ASSERT(0 && "Invalid backend func\n");
1214 return nullptr;
1215 break;
1216 }
1217 }
1218
1219 // Recursively parse args
1220 template <typename... TArgsT>
1221 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1222 {
1223 switch(tArg)
1224 {
1225 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1226 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1227 default:
1228 SWR_ASSERT(0 && "Invalid sample pattern\n");
1229 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1230 break;
1231 }
1232 }
1233
1234 // Recursively parse args
1235 template <typename... TArgsT>
1236 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
1237 {
1238 switch(tArg)
1239 {
1240 case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
1241 case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
1242 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
1243 default:
1244 SWR_ASSERT(0 && "Invalid sample pattern\n");
1245 return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
1246 break;
1247 }
1248 }
1249
1250 // Recursively parse args
1251 template <typename... TArgsT>
1252 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1253 {
1254 switch(tArg)
1255 {
1256 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1257 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1258 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1259 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1260 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1261 default:
1262 SWR_ASSERT(0 && "Invalid sample count\n");
1263 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1264 break;
1265 }
1266 }
1267
1268 // Recursively parse args
1269 template <typename... TArgsT>
1270 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1271 {
1272 if(tArg == true)
1273 {
1274 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1275 }
1276
1277 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1278 }
1279 };
1280
1281 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
1282 {
1283 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1284 {
1285 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1286 {
1287 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1288 {
1289 table[inputCoverage][isCentroid][canEarlyZ] =
1290 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1291 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1292 }
1293 }
1294 }
1295 }
1296
1297 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
1298 {
1299 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1300 {
1301 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
1302 {
1303 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1304 {
1305 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1306 {
1307 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1308 {
1309 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1310 {
1311 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1312 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
1313 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1314 }
1315 }
1316 }
1317 }
1318 }
1319 }
1320 }
1321
1322 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
1323 {
1324 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1325 {
1326 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1327 {
1328 for(uint32_t centroid = 0; centroid < 2; centroid++)
1329 {
1330 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1331 {
1332 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1333 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1334 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1335 }
1336 }
1337 }
1338 }
1339 }
1340
1341 void InitBackendFuncTables()
1342 {
1343 InitBackendSingleFuncTable(gBackendSingleSample);
1344 InitBackendPixelFuncTable(gBackendPixelRateTable);
1345 InitBackendSampleFuncTable(gBackendSampleRateTable);
1346
1347 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1348 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1349 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1350 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1351 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1352 }