swr: [rasterizer core] SIMD16 Frontend WIP - Clipper
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "tilemgr.h"
34 #include "memory/tilingtraits.h"
35 #include "core/multisample.h"
36
37 #include <algorithm>
38
39 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
40 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
41
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 SWR_CONTEXT *pContext = pDC->pContext;
51
52 AR_BEGIN(BEDispatch, pDC->drawId);
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->ppScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
77
78 AR_END(BEDispatch, 1);
79 }
80
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
86 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
87 {
88 // Dummy function
89 }
90
91 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
92 {
93 uint32_t x, y;
94 MacroTileMgr::getTileIndices(macroTile, x, y);
95 SWR_ASSERT(x == 0 && y == 0);
96 }
97
98 template<SWR_FORMAT format>
99 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
100 {
101 auto lambda = [&](int32_t comp)
102 {
103 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
104
105 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
106 };
107
108 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
109
110 for (uint32_t i = 0; i < numIter; ++i)
111 {
112 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
113 }
114 }
115
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 variant: fill one raster tile with a packed clear value, one SIMD16 tile
///        at a time.
/// @tparam format - hot tile format; selects the FormatTraits used to store.
/// @param pTileBuffer - start of the raster tile to fill.
/// @param value - per-component clear values, stored with FormatTraits<format>::storeSOA.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    typedef FormatTraits<format> Traits;

    // Number of SIMD16 tiles that make up one raster tile.
    const uint32_t simd16TilesPerRasterTile =
        (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);

    // Store one component for the current SIMD16 tile, then step past the bytes just written.
    auto storeComponent = [&](int32_t comp)
    {
        Traits::storeSOA(comp, pTileBuffer, value.v[comp]);

        pTileBuffer += (KNOB_SIMD16_WIDTH * Traits::GetBPC(comp) / 8);
    };

    for (uint32_t remaining = simd16TilesPerRasterTile; remaining != 0; --remaining)
    {
        UnrollerL<0, Traits::numComps, 1>::step(storeComponent);
    }
}

#endif
136 template<SWR_FORMAT format>
137 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
138 {
139 // convert clear color to hottile format
140 // clear color is in RGBA float/uint32
141 #if USE_8x2_TILE_BACKEND
142 simd16vector vClear;
143 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
144 {
145 simd16scalar vComp;
146 vComp = _simd16_load1_ps((const float*)&clear[comp]);
147 if (FormatTraits<format>::isNormalized(comp))
148 {
149 vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
150 vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
151 }
152 vComp = FormatTraits<format>::pack(comp, vComp);
153 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
154 }
155
156 #else
157 simdvector vClear;
158 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
159 {
160 simdscalar vComp;
161 vComp = _simd_load1_ps((const float*)&clear[comp]);
162 if (FormatTraits<format>::isNormalized(comp))
163 {
164 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
165 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
166 }
167 vComp = FormatTraits<format>::pack(comp, vComp);
168 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
169 }
170
171 #endif
172 uint32_t tileX, tileY;
173 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
174
175 // Init to full macrotile
176 SWR_RECT clearTile =
177 {
178 KNOB_MACROTILE_X_DIM * int32_t(tileX),
179 KNOB_MACROTILE_Y_DIM * int32_t(tileY),
180 KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
181 KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
182 };
183
184 // intersect with clear rect
185 clearTile &= rect;
186
187 // translate to local hottile origin
188 clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
189
190 // Make maximums inclusive (needed for convert to raster tiles)
191 clearTile.xmax -= 1;
192 clearTile.ymax -= 1;
193
194 // convert to raster tiles
195 clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
196 clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
197 clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
198 clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
199
200 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
201 // compute steps between raster tile samples / raster tiles / macro tile rows
202 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
203 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
204 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
205 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
206
207 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
208 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
209 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
210
211 // loop over all raster tiles in the current hot tile
212 for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
213 {
214 uint8_t* pRasterTile = pRasterTileRow;
215 for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
216 {
217 for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
218 {
219 ClearRasterTile<format>(pRasterTile, vClear);
220 pRasterTile += rasterTileSampleStep;
221 }
222 }
223 pRasterTileRow += macroTileRowStep;
224 }
225
226 pHotTile->state = HOTTILE_DIRTY;
227 }
228
229
230 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
231 {
232 SWR_CONTEXT *pContext = pDC->pContext;
233
234 if (KNOB_FAST_CLEAR)
235 {
236 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
237 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
238 uint32_t numSamples = GetNumSamples(sampleCount);
239
240 SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
241
242 AR_BEGIN(BEClear, pDC->drawId);
243
244 if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
245 {
246 unsigned long rt = 0;
247 uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
248 while (_BitScanForward(&rt, mask))
249 {
250 mask &= ~(1 << rt);
251
252 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
253
254 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
255 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
256 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
257 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
258 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
259 pHotTile->state = HOTTILE_CLEAR;
260 }
261 }
262
263 if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
264 {
265 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
266 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
267 pHotTile->state = HOTTILE_CLEAR;
268 }
269
270 if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
271 {
272 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
273
274 pHotTile->clearData[0] = pClear->clearStencil;
275 pHotTile->state = HOTTILE_CLEAR;
276 }
277
278 AR_END(BEClear, 1);
279 }
280 else
281 {
282 // Legacy clear
283 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
284 AR_BEGIN(BEClear, pDC->drawId);
285
286 if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
287 {
288 DWORD clearData[4];
289 clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
290 clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
291 clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
292 clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
293
294 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
295 SWR_ASSERT(pfnClearTiles != nullptr);
296
297 unsigned long rt = 0;
298 uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
299 while (_BitScanForward(&rt, mask))
300 {
301 mask &= ~(1 << rt);
302
303 pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
304 }
305 }
306
307 if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
308 {
309 DWORD clearData[4];
310 clearData[0] = *(DWORD*)&pClear->clearDepth;
311 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
312 SWR_ASSERT(pfnClearTiles != nullptr);
313
314 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
315 }
316
317 if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
318 {
319 DWORD clearData[4];
320 clearData[0] = pClear->clearStencil;
321 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
322
323 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
324 }
325
326 AR_END(BEClear, 1);
327 }
328 }
329
330 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
331 SWR_RENDERTARGET_ATTACHMENT attachment)
332 {
333 SWR_CONTEXT *pContext = pDC->pContext;
334
335 AR_BEGIN(BEStoreTiles, pDC->drawId);
336
337 SWR_FORMAT srcFormat;
338 switch (attachment)
339 {
340 case SWR_ATTACHMENT_COLOR0:
341 case SWR_ATTACHMENT_COLOR1:
342 case SWR_ATTACHMENT_COLOR2:
343 case SWR_ATTACHMENT_COLOR3:
344 case SWR_ATTACHMENT_COLOR4:
345 case SWR_ATTACHMENT_COLOR5:
346 case SWR_ATTACHMENT_COLOR6:
347 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
348 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
349 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
350 default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
351 }
352
353 uint32_t x, y;
354 MacroTileMgr::getTileIndices(macroTile, x, y);
355
356 // Only need to store the hottile if it's been rendered to...
357 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
358 if (pHotTile)
359 {
360 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
361 if (pHotTile->state == HOTTILE_CLEAR)
362 {
363 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
364 SWR_ASSERT(pfnClearTiles != nullptr);
365
366 pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
367 }
368
369 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
370 {
371 int32_t destX = KNOB_MACROTILE_X_DIM * x;
372 int32_t destY = KNOB_MACROTILE_Y_DIM * y;
373
374 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
375 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
376 }
377
378
379 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
380 {
381 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
382 }
383 }
384 AR_END(BEStoreTiles, 1);
385 }
386
387 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
388 {
389 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
390
391 unsigned long rt = 0;
392 uint32_t mask = pDesc->attachmentMask;
393 while (_BitScanForward(&rt, mask))
394 {
395 mask &= ~(1 << rt);
396 ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
397 }
398 }
399
400 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
401 {
402 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
403 SWR_CONTEXT *pContext = pDC->pContext;
404
405 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
406
407 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
408 {
409 if (pDesc->attachmentMask & (1 << i))
410 {
411 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
412 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
413 if (pHotTile)
414 {
415 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
416 }
417 }
418 }
419 }
420
421 #if KNOB_SIMD_WIDTH == 8
422 const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
423 const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
424 const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
425 const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
426 #else
427 #error Unsupported vector width
428 #endif
429
430 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
431 {
432 simdscalar vClipMask = _simd_setzero_ps();
433 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
434
435 for (uint32_t i = 0; i < numClipDistance; ++i)
436 {
437 // pull triangle clip distance values from clip buffer
438 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
439 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
440 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
441
442 // interpolate
443 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
444
445 // clip if interpolated clip distance is < 0 || NAN
446 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
447
448 vClipMask = _simd_or_ps(vClipMask, vCull);
449 }
450
451 return _simd_movemask_ps(vClipMask);
452 }
453
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample pixel backend. Walks one raster tile in SIMD-tile steps and, for each
///        step with coverage, runs the optional depth bounds test, barycentric and Z
///        interpolation, user clip, depth/stencil test (before or after the pixel shader
///        depending on T::bCanEarlyZ), the pixel shader and the output merger.
/// @tparam T - backend traits; this body reads T::InputCoverage and T::bCanEarlyZ.
/// @param pDC - pointer to draw context.
/// @param workerId - worker id. NOTE(review): not referenced by name in this body; presumably
///        consumed by the AR_* / UPDATE_STAT_BE macros - confirm.
/// @param x - starting x of the tile walk; added to the per-lane X offsets.
/// @param y - starting y of the tile walk; added to the per-lane Y offsets.
/// @param work - triangle descriptor. Its coverageMask[0] (and innerCoverageMask for inner
///        conservative input coverage) are shifted right as SIMD tiles are consumed.
/// @param renderBuffers - source of the color/depth/stencil pointers (via SetupRenderBuffers).
454 template<typename T>
455 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
456 {
457 SWR_CONTEXT *pContext = pDC->pContext;
458
459 AR_BEGIN(BESingleSampleBackend, pDC->drawId);
460 AR_BEGIN(BESetup, pDC->drawId);
461
462 const API_STATE &state = GetApiState(pDC);
463
464 BarycentricCoeffs coeffs;
465 SetupBarycentricCoeffs(&coeffs, work);
466
467 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
468 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
469
470 SWR_PS_CONTEXT psContext;
471 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
472 SetupPixelShaderContext<T>(&psContext, samplePos, work);
473
474 AR_END(BESetup, 1);
475
// Seed the per-lane Y positions for the first row of SIMD tiles; advanced by dy per row.
476 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
477 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
478
479 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
480
481 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
482 {
// Restart the per-lane X positions at the left edge for every row; advanced by dx per step.
483 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
484 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
485
486 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
487
488 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
489 {
490 #if USE_8x2_TILE_BACKEND
// True when the SIMD_TILE_X_DIM bit of xx is set; passed to the 8x2 output merger and used
// below to decide when the color pointers advance.
491 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
492
493 #endif
// The low MASK bits of coverageMask[0] are the coverage for the current SIMD tile.
494 simdmask coverageMask = work.coverageMask[0] & MASK;
495
496 if (coverageMask)
497 {
498 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
499 {
500 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
501
502 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
503
504 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
505 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
506
507 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
508 }
509
510 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
511 {
512 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
513
514 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
515 }
516
517 AR_BEGIN(BEBarycentric, pDC->drawId);
518
519 CalcPixelBarycentrics(coeffs, psContext);
520
521 CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
522
523 // interpolate and quantize z
524 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
525 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
526
527 AR_END(BEBarycentric, 1);
528
529 // interpolate user clip distance if available
530 if (state.rastState.clipDistanceMask)
531 {
532 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
533 }
534
// Expand the bit mask to a per-lane vector mask; depth/stencil pass masks start as "all covered".
535 simdscalar vCoverageMask = vMask(coverageMask);
536 simdscalar depthPassMask = vCoverageMask;
537 simdscalar stencilPassMask = vCoverageMask;
538
539 // Early-Z?
540 if (T::bCanEarlyZ)
541 {
542 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
543 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
544 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
545 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
546 AR_END(BEEarlyDepthTest, 0);
547
548 // early-exit if no pixels passed depth or earlyZ is forced on
549 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
550 {
551 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
552 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
553
554 if (!_simd_movemask_ps(depthPassMask))
555 {
// Nothing passed depth: skip the pixel shader and output merger for this SIMD tile.
556 goto Endtile;
557 }
558 }
559 }
560
561 psContext.sampleIndex = 0;
562 psContext.activeMask = _simd_castps_si(vCoverageMask);
563
564 // execute pixel shader
565 AR_BEGIN(BEPixelShader, pDC->drawId);
566 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
567 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
568 AR_END(BEPixelShader, 0);
569
// Pick up the active mask as left by the pixel shader.
570 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
571
572 // late-Z
573 if (!T::bCanEarlyZ)
574 {
575 AR_BEGIN(BELateDepthTest, pDC->drawId);
576 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
577 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
578 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
579 AR_END(BELateDepthTest, 0);
580
581 if (!_simd_movemask_ps(depthPassMask))
582 {
583 // need to call depth/stencil write for stencil write
584 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
585 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
586 goto Endtile;
587 }
588 }
589
590 uint32_t statMask = _simd_movemask_ps(depthPassMask);
591 uint32_t statCount = _mm_popcnt_u32(statMask);
592 UPDATE_STAT_BE(DepthPassCount, statCount);
593
594 // output merger
595 AR_BEGIN(BEOutputMerger, pDC->drawId);
596 #if USE_8x2_TILE_BACKEND
597 OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
598 #else
599 OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
600 #endif
601
602 // do final depth write after all pixel kills
// (skipped when forceEarlyZ is set: the early-Z branch above already performed the write)
603 if (!state.psState.forceEarlyZ)
604 {
605 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
606 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
607 }
608 AR_END(BEOutputMerger, 0);
609 }
610
611 Endtile:
612 AR_BEGIN(BEEndTile, pDC->drawId);
613
// Shift out the coverage bits just consumed so the next SIMD tile's bits sit in the low positions.
614 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
615 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
616 {
617 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
618 }
619
620 #if USE_8x2_TILE_BACKEND
// 8x2 mode: color pointers advance only on alternate-offset steps, by two SIMD tiles' worth of bytes.
621 if (useAlternateOffset)
622 {
623 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
624 {
625 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
626 }
627 }
628 #else
629 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
630 {
631 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
632 }
633 #endif
// Depth and stencil pointers advance by one SIMD tile on every step.
634 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
635 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
636
637 AR_END(BEEndTile, 0);
638
639 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
640 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
641 }
642
643 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
644 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
645 }
646
647 AR_END(BESingleSampleBackend, 0);
648 }
649
//////////////////////////////////////////////////////////////////////////
/// @brief Sample-rate pixel backend. Walks one raster tile in SIMD-tile steps; for every
///        step it loops over all T::MultisampleT::numSamples samples and, for each sample
///        with coverage, runs the optional depth bounds test, per-sample barycentric and Z
///        interpolation, user clip, depth/stencil test (before or after the pixel shader
///        depending on T::bCanEarlyZ), the pixel shader and the output merger.
/// @tparam T - backend traits; this body reads T::InputCoverage, T::bCanEarlyZ and
///        T::MultisampleT::numSamples.
/// @param pDC - pointer to draw context.
/// @param workerId - worker id. NOTE(review): not referenced by name in this body; presumably
///        consumed by the AR_* / UPDATE_STAT_BE macros - confirm.
/// @param x - starting x of the tile walk; added to the per-lane X offsets.
/// @param y - starting y of the tile walk; added to the per-lane Y offsets.
/// @param work - triangle descriptor. Each coverageMask[sample] (and innerCoverageMask for
///        inner conservative input coverage) is shifted right as SIMD tiles are consumed.
/// @param renderBuffers - source of the color/depth/stencil pointers (via SetupRenderBuffers).
650 template<typename T>
651 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
652 {
653 SWR_CONTEXT *pContext = pDC->pContext;
654
655 AR_BEGIN(BESampleRateBackend, pDC->drawId);
656 AR_BEGIN(BESetup, pDC->drawId);
657
658 const API_STATE &state = GetApiState(pDC);
659
660 BarycentricCoeffs coeffs;
661 SetupBarycentricCoeffs(&coeffs, work);
662
663 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
664 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
665
666 SWR_PS_CONTEXT psContext;
667 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
668 SetupPixelShaderContext<T>(&psContext, samplePos, work);
669
670 AR_END(BESetup, 0);
671
// Seed the per-lane Y positions for the first row of SIMD tiles; advanced by dy per row.
672 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
673 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
674
675 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
676
677 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
678 {
// Restart the per-lane X positions at the left edge for every row; advanced by dx per step.
679 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
680 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
681
682 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
683
684 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
685 {
686 #if USE_8x2_TILE_BACKEND
// True when the SIMD_TILE_X_DIM bit of xx is set; passed to the 8x2 output merger and used
// below to decide when the color pointers advance.
687 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
688
689 #endif
690 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
691 {
692 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
693
694 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
695 }
696
697 AR_BEGIN(BEBarycentric, pDC->drawId);
698
699 CalcPixelBarycentrics(coeffs, psContext);
700
701 CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
702
703 AR_END(BEBarycentric, 0);
704
// Every sample of the current SIMD tile is tested and shaded independently.
705 for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
706 {
// The low MASK bits of coverageMask[sample] are this sample's coverage for the current SIMD tile.
707 simdmask coverageMask = work.coverageMask[sample] & MASK;
708
709 if (coverageMask)
710 {
711 // offset depth/stencil buffers current sample
712 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
713 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
714
715 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
716 {
717 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
718
719 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
720
721 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
722 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
723
724 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
725 }
726
727 AR_BEGIN(BEBarycentric, pDC->drawId);
728
729 // calculate per sample positions
730 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
731 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
732
733 CalcSampleBarycentrics(coeffs, psContext);
734
735 // interpolate and quantize z
736 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
737 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
738
739 AR_END(BEBarycentric, 0);
740
741 // interpolate user clip distance if available
742 if (state.rastState.clipDistanceMask)
743 {
744 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
745 }
746
// Expand the bit mask to a per-lane vector mask; depth/stencil pass masks start as "all covered".
747 simdscalar vCoverageMask = vMask(coverageMask);
748 simdscalar depthPassMask = vCoverageMask;
749 simdscalar stencilPassMask = vCoverageMask;
750
751 // Early-Z?
752 if (T::bCanEarlyZ)
753 {
754 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
755 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
756 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
757 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
758 AR_END(BEEarlyDepthTest, 0);
759
760 // early-exit if no samples passed depth or earlyZ is forced on.
761 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
762 {
763 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
764 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
765
766 if (!_simd_movemask_ps(depthPassMask))
767 {
// "continue" bypasses the shift at the bottom of the sample loop, so consume this sample's
// coverage bits here before moving to the next sample.
768 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
769 continue;
770 }
771 }
772 }
773
774 psContext.sampleIndex = sample;
775 psContext.activeMask = _simd_castps_si(vCoverageMask);
776
777 // execute pixel shader
778 AR_BEGIN(BEPixelShader, pDC->drawId);
779 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
780 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
781 AR_END(BEPixelShader, 0);
782
// Pick up the active mask as left by the pixel shader.
783 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
784
785 // late-Z
786 if (!T::bCanEarlyZ)
787 {
788 AR_BEGIN(BELateDepthTest, pDC->drawId);
789 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
790 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
791 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
792 AR_END(BELateDepthTest, 0);
793
794 if (!_simd_movemask_ps(depthPassMask))
795 {
796 // need to call depth/stencil write for stencil write
797 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
798 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
799
// Same as the early-Z exit: shift before "continue" so the coverage bits are still consumed.
800 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
801 continue;
802 }
803 }
804
805 uint32_t statMask = _simd_movemask_ps(depthPassMask);
806 uint32_t statCount = _mm_popcnt_u32(statMask);
807 UPDATE_STAT_BE(DepthPassCount, statCount);
808
809 // output merger
810 AR_BEGIN(BEOutputMerger, pDC->drawId);
811 #if USE_8x2_TILE_BACKEND
812 OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
813 #else
814 OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
815 #endif
816
817 // do final depth write after all pixel kills
// (skipped when forceEarlyZ is set: the early-Z branch above already performed the write)
818 if (!state.psState.forceEarlyZ)
819 {
820 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
821 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
822 }
823 AR_END(BEOutputMerger, 0);
824 }
// Normal path: consume this sample's coverage bits for the current SIMD tile.
825 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
826 }
827
// No goto in this function targets this label; ATTR_UNUSED presumably silences the
// unused-label warning - confirm against its definition.
828 Endtile:
829 ATTR_UNUSED;
830
831 AR_BEGIN(BEEndTile, pDC->drawId);
832
833 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
834 {
835 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
836 }
837
838 #if USE_8x2_TILE_BACKEND
// 8x2 mode: color pointers advance only on alternate-offset steps, by two SIMD tiles' worth of bytes.
839 if (useAlternateOffset)
840 {
841 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
842 {
843 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
844 }
845 }
846 #else
847 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
848 {
849 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
850 }
851 #endif
// Depth and stencil pointers advance by one SIMD tile on every step.
852 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
853 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
854
855 AR_END(BEEndTile, 0);
856
857 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
858 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
859 }
860
861 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
862 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
863 }
864
865 AR_END(BESampleRateBackend, 0);
866 }
867 // optimized backend flow with NULL PS
868 template<uint32_t sampleCountT>
869 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
870 {
871 SWR_CONTEXT *pContext = pDC->pContext;
872
873 AR_BEGIN(BENullBackend, pDC->drawId);
874 ///@todo: handle center multisample pattern
875 typedef SwrBackendTraits<sampleCountT, false> T;
876 AR_BEGIN(BESetup, pDC->drawId);
877
878 const API_STATE &state = GetApiState(pDC);
879
880 BarycentricCoeffs coeffs;
881 SetupBarycentricCoeffs(&coeffs, work);
882
883 uint8_t *pDepthBuffer, *pStencilBuffer;
884 SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
885
886 SWR_PS_CONTEXT psContext;
887 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
888
889 AR_END(BESetup, 0);
890
891 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
892
893 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
894 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
895 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
896 {
897 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
898
899 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
900
901 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
902 {
903 // iterate over active samples
904 unsigned long sample = 0;
905 uint32_t sampleMask = state.blendState.sampleMask;
906 while (_BitScanForward(&sample, sampleMask))
907 {
908 sampleMask &= ~(1 << sample);
909
910 simdmask coverageMask = work.coverageMask[sample] & MASK;
911
912 if (coverageMask)
913 {
914 // offset depth/stencil buffers current sample
915 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
916 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
917
918 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
919 {
920 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
921
922 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
923
924 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
925 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
926
927 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
928 }
929
930 AR_BEGIN(BEBarycentric, pDC->drawId);
931
932 // calculate per sample positions
933 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
934 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));
935
936 CalcSampleBarycentrics(coeffs, psContext);
937
938 // interpolate and quantize z
939 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
940 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
941
942 AR_END(BEBarycentric, 0);
943
944 // interpolate user clip distance if available
945 if (state.rastState.clipDistanceMask)
946 {
947 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
948 }
949
950 simdscalar vCoverageMask = vMask(coverageMask);
951 simdscalar stencilPassMask = vCoverageMask;
952
953 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
954 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
955 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
956 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
957 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
958 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
959 AR_END(BEEarlyDepthTest, 0);
960
961 uint32_t statMask = _simd_movemask_ps(depthPassMask);
962 uint32_t statCount = _mm_popcnt_u32(statMask);
963 UPDATE_STAT_BE(DepthPassCount, statCount);
964 }
965
966 Endtile:
967 ATTR_UNUSED;
968 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
969 }
970
971 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
972 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
973
974 vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
975 }
976
977 vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
978 }
979
980 AR_END(BENullBackend, 0);
981 }
982
983 void InitClearTilesTable()
984 {
985 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
986
987 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
988 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
989 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
990 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
991 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
992 }
993
994 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
995 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
996 [2] // centroid
997 [2] // canEarlyZ
998 = {};
999 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1000 [2] // isCenterPattern
1001 [SWR_INPUT_COVERAGE_COUNT]
1002 [2] // centroid
1003 [2] // forcedSampleCount
1004 [2] // canEarlyZ
1005 = {};
1006 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1007 [SWR_INPUT_COVERAGE_COUNT]
1008 [2] // centroid
1009 [2] // canEarlyZ
1010 = {};
1011
1012 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1013 // arguments to static template arguments.
1014 template <uint32_t... ArgsT>
1015 struct BEChooser
1016 {
1017 // Last Arg Terminator
1018 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1019 {
1020 switch(tArg)
1021 {
1022 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1023 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1024 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1025 default:
1026 SWR_ASSERT(0 && "Invalid backend func\n");
1027 return nullptr;
1028 break;
1029 }
1030 }
1031
1032 // Recursively parse args
1033 template <typename... TArgsT>
1034 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
1035 {
1036 switch(tArg)
1037 {
1038 case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
1039 case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
1040 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
1041 default:
1042 SWR_ASSERT(0 && "Invalid sample pattern\n");
1043 return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
1044 break;
1045 }
1046 }
1047
1048 // Recursively parse args
1049 template <typename... TArgsT>
1050 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1051 {
1052 switch(tArg)
1053 {
1054 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1055 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1056 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1057 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1058 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1059 default:
1060 SWR_ASSERT(0 && "Invalid sample count\n");
1061 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1062 break;
1063 }
1064 }
1065
1066 // Recursively parse args
1067 template <typename... TArgsT>
1068 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1069 {
1070 if(tArg == true)
1071 {
1072 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1073 }
1074
1075 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1076 }
1077 };
1078
1079 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
1080 {
1081 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1082 {
1083 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1084 {
1085 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1086 {
1087 table[inputCoverage][isCentroid][canEarlyZ] =
1088 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
1089 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1090 }
1091 }
1092 }
1093 }
1094
1095 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
1096 {
1097 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1098 {
1099 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1100 {
1101 for(uint32_t centroid = 0; centroid < 2; centroid++)
1102 {
1103 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1104 {
1105 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1106 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
1107 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1108 }
1109 }
1110 }
1111 }
1112 }
1113
1114 void InitBackendPixelRate0();
1115 void InitBackendFuncTables()
1116 {
1117 InitBackendSingleFuncTable(gBackendSingleSample);
1118 InitBackendPixelRate0();
1119 InitBackendSampleFuncTable(gBackendSampleRateTable);
1120
1121 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1122 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1123 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1124 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1125 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1126 }