0a0001d07761fb4e970fc5a771257074fe72dc76
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4], const SWR_RECT& rect);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 SWR_CONTEXT *pContext = pDC->pContext;
51
52 AR_BEGIN(BEDispatch, pDC->drawId);
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->ppScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
77
78 AR_END(BEDispatch, 1);
79 }
80
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
86 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
87 {
88 // Dummy function
89 }
90
91 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
92 {
93 uint32_t x, y;
94 MacroTileMgr::getTileIndices(macroTile, x, y);
95 SWR_ASSERT(x == 0 && y == 0);
96 }
97
98 template<SWR_FORMAT format>
99 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
100 {
101 auto lambda = [&](int32_t comp)
102 {
103 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
104 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
105 };
106
107 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
108 for (uint32_t i = 0; i < numIter; ++i)
109 {
110 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
111 }
112 }
113
114 template<SWR_FORMAT format>
115 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4], const SWR_RECT& rect)
116 {
117 // convert clear color to hottile format
118 // clear color is in RGBA float/uint32
119 simdvector vClear;
120 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
121 {
122 simdscalar vComp;
123 vComp = _simd_load1_ps((const float*)&clear[comp]);
124 if (FormatTraits<format>::isNormalized(comp))
125 {
126 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
127 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
128 }
129 vComp = FormatTraits<format>::pack(comp, vComp);
130 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
131 }
132
133 uint32_t tileX, tileY;
134 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
135
136 // Init to full macrotile
137 SWR_RECT clearTile =
138 {
139 KNOB_MACROTILE_X_DIM * int32_t(tileX),
140 KNOB_MACROTILE_Y_DIM * int32_t(tileY),
141 KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
142 KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
143 };
144
145 // intersect with clear rect
146 clearTile &= rect;
147
148 // translate to local hottile origin
149 clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
150
151 // Make maximums inclusive (needed for convert to raster tiles)
152 clearTile.xmax -= 1;
153 clearTile.ymax -= 1;
154
155 // convert to raster tiles
156 clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
157 clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
158 clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
159 clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
160
161 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
162 // compute steps between raster tile samples / raster tiles / macro tile rows
163 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
164 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
165 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
166 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
167
168 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
169 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
170 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
171
172 // loop over all raster tiles in the current hot tile
173 for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
174 {
175 uint8_t* pRasterTile = pRasterTileRow;
176 for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
177 {
178 for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
179 {
180 ClearRasterTile<format>(pRasterTile, vClear);
181 pRasterTile += rasterTileSampleStep;
182 }
183 }
184 pRasterTileRow += macroTileRowStep;
185 }
186
187 pHotTile->state = HOTTILE_DIRTY;
188 }
189
190
191 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
192 {
193 SWR_CONTEXT *pContext = pDC->pContext;
194
195 if (KNOB_FAST_CLEAR)
196 {
197 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
198 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
199 uint32_t numSamples = GetNumSamples(sampleCount);
200
201 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
202
203 AR_BEGIN(BEClear, pDC->drawId);
204
205 if (pClear->flags.mask & SWR_CLEAR_COLOR)
206 {
207 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
208 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
209 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
210 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
211 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
212 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
213 pHotTile->state = HOTTILE_CLEAR;
214 }
215
216 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
217 {
218 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
219 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
220 pHotTile->state = HOTTILE_CLEAR;
221 }
222
223 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
224 {
225 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
226
227 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
228 pHotTile->state = HOTTILE_CLEAR;
229 }
230
231 AR_END(BEClear, 1);
232 }
233 else
234 {
235 // Legacy clear
236 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
237 AR_BEGIN(BEClear, pDC->drawId);
238
239 if (pClear->flags.mask & SWR_CLEAR_COLOR)
240 {
241 /// @todo clear data should come in as RGBA32_FLOAT
242 DWORD clearData[4];
243 float clearFloat[4];
244 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
245 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
246 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
247 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
248 clearData[0] = *(DWORD*)&clearFloat[0];
249 clearData[1] = *(DWORD*)&clearFloat[1];
250 clearData[2] = *(DWORD*)&clearFloat[2];
251 clearData[3] = *(DWORD*)&clearFloat[3];
252
253 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
254 SWR_ASSERT(pfnClearTiles != nullptr);
255
256 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData, pClear->rect);
257 }
258
259 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
260 {
261 DWORD clearData[4];
262 clearData[0] = *(DWORD*)&pClear->clearDepth;
263 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
264 SWR_ASSERT(pfnClearTiles != nullptr);
265
266 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData, pClear->rect);
267 }
268
269 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
270 {
271 uint32_t value = pClear->clearStencil;
272 DWORD clearData[4];
273 clearData[0] = *(DWORD*)&value;
274 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
275
276 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData, pClear->rect);
277 }
278
279 AR_END(BEClear, 1);
280 }
281 }
282
283
284 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
285 {
286 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
287 SWR_CONTEXT *pContext = pDC->pContext;
288
289 AR_BEGIN(BEStoreTiles, pDC->drawId);
290
291 #ifdef KNOB_ENABLE_RDTSC
292 uint32_t numTiles = 0;
293 #endif
294 SWR_FORMAT srcFormat;
295 switch (pDesc->attachment)
296 {
297 case SWR_ATTACHMENT_COLOR0:
298 case SWR_ATTACHMENT_COLOR1:
299 case SWR_ATTACHMENT_COLOR2:
300 case SWR_ATTACHMENT_COLOR3:
301 case SWR_ATTACHMENT_COLOR4:
302 case SWR_ATTACHMENT_COLOR5:
303 case SWR_ATTACHMENT_COLOR6:
304 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
305 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
306 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
307 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
308 }
309
310 uint32_t x, y;
311 MacroTileMgr::getTileIndices(macroTile, x, y);
312
313 // Only need to store the hottile if it's been rendered to...
314 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
315 if (pHotTile)
316 {
317 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
318 if (pHotTile->state == HOTTILE_CLEAR)
319 {
320 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
321 SWR_ASSERT(pfnClearTiles != nullptr);
322
323 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData, pDesc->rect);
324 }
325
326 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
327 {
328 int32_t destX = KNOB_MACROTILE_X_DIM * x;
329 int32_t destY = KNOB_MACROTILE_Y_DIM * y;
330
331 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
332 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
333 }
334
335
336 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
337 {
338 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
339 }
340 }
341 AR_END(BEStoreTiles, numTiles);
342 }
343
344
345 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
346 {
347 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
348 SWR_CONTEXT *pContext = pDC->pContext;
349
350 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
351
352 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
353 {
354 if (pDesc->attachmentMask & (1 << i))
355 {
356 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
357 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
358 if (pHotTile)
359 {
360 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
361 }
362 }
363 }
364 }
365
366 #if KNOB_SIMD_WIDTH == 8
367 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
368 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
369 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
370 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
371 #else
372 #error Unsupported vector width
373 #endif
374
375 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
376 {
377 simdscalar vClipMask = _simd_setzero_ps();
378 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
379
380 for (uint32_t i = 0; i < numClipDistance; ++i)
381 {
382 // pull triangle clip distance values from clip buffer
383 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
384 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
385 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
386
387 // interpolate
388 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
389
390 // clip if interpolated clip distance is < 0 || NAN
391 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
392
393 vClipMask = _simd_or_ps(vClipMask, vCull);
394 }
395
396 return _simd_movemask_ps(vClipMask);
397 }
398
399 template<typename T>
400 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
401 {
402 SWR_CONTEXT *pContext = pDC->pContext;
403
404 AR_BEGIN(BESingleSampleBackend, pDC->drawId);
405 AR_BEGIN(BESetup, pDC->drawId);
406
407 const API_STATE& state = GetApiState(pDC);
408 const SWR_RASTSTATE& rastState = state.rastState;
409 const SWR_PS_STATE *pPSState = &state.psState;
410 const SWR_BLEND_STATE *pBlendState = &state.blendState;
411 uint64_t coverageMask = work.coverageMask[0];
412
413 // broadcast scalars
414 BarycentricCoeffs coeffs;
415 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
416 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
417 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
418
419 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
420 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
421 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
422
423 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
424 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
425 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
426
427 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
428
429 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
430 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
431 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
432
433 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
434 uint32_t NumRT = state.psState.numRenderTargets;
435 for(uint32_t rt = 0; rt < NumRT; ++rt)
436 {
437 pColorBase[rt] = renderBuffers.pColor[rt];
438 }
439 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
440 AR_END(BESetup, 1);
441
442 SWR_PS_CONTEXT psContext;
443 psContext.pAttribs = work.pAttribs;
444 psContext.pPerspAttribs = work.pPerspAttribs;
445 psContext.frontFace = work.triFlags.frontFacing;
446 psContext.primID = work.triFlags.primID;
447
448 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
449 psContext.I = work.I;
450 psContext.J = work.J;
451 psContext.recipDet = work.recipDet;
452 psContext.pRecipW = work.pRecipW;
453 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
454 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
455 psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
456
457 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
458 {
459 // UL pixel corner
460 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
461 // pixel center
462 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
463
464 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
465 {
466 if(coverageMask & MASK)
467 {
468 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
469 // pixel center
470 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
471
472 if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
473 {
474 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
475 &work.coverageMask[0];
476 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
477 }
478
479 AR_BEGIN(BEBarycentric, pDC->drawId);
480 CalcPixelBarycentrics(coeffs, psContext);
481
482 // for 1x case, centroid is pixel center
483 psContext.vX.centroid = psContext.vX.center;
484 psContext.vY.centroid = psContext.vY.center;
485 psContext.vI.centroid = psContext.vI.center;
486 psContext.vJ.centroid = psContext.vJ.center;
487 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
488
489 // interpolate and quantize z
490 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
491 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
492 AR_END(BEBarycentric, 1);
493
494 simdmask clipCoverageMask = coverageMask & MASK;
495 // interpolate user clip distance if available
496 if(rastState.clipDistanceMask)
497 {
498 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
499 psContext.vI.center, psContext.vJ.center);
500 }
501
502 simdscalar vCoverageMask = vMask(clipCoverageMask);
503 simdscalar depthPassMask = vCoverageMask;
504 simdscalar stencilPassMask = vCoverageMask;
505
506 // Early-Z?
507 if(T::bCanEarlyZ)
508 {
509 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
510 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
511 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
512 AR_END(BEEarlyDepthTest, 0);
513
514 // early-exit if no pixels passed depth or earlyZ is forced on
515 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
516 {
517 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
518 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
519
520 if (!_simd_movemask_ps(depthPassMask))
521 {
522 goto Endtile;
523 }
524 }
525 }
526
527 psContext.sampleIndex = 0;
528 psContext.activeMask = _simd_castps_si(vCoverageMask);
529
530 // execute pixel shader
531 AR_BEGIN(BEPixelShader, pDC->drawId);
532 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
533 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
534 AR_END(BEPixelShader, 0);
535
536 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
537
538 // late-Z
539 if(!T::bCanEarlyZ)
540 {
541 AR_BEGIN(BELateDepthTest, pDC->drawId);
542 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
543 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
544 AR_END(BELateDepthTest, 0);
545
546 if(!_simd_movemask_ps(depthPassMask))
547 {
548 // need to call depth/stencil write for stencil write
549 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
550 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
551 goto Endtile;
552 }
553 }
554
555 uint32_t statMask = _simd_movemask_ps(depthPassMask);
556 uint32_t statCount = _mm_popcnt_u32(statMask);
557 UPDATE_STAT(DepthPassCount, statCount);
558
559 // output merger
560 AR_BEGIN(BEOutputMerger, pDC->drawId);
561 OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
562
563 // do final depth write after all pixel kills
564 if (!pPSState->forceEarlyZ)
565 {
566 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
567 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
568 }
569 AR_END(BEOutputMerger, 0);
570 }
571
572 Endtile:
573 AR_BEGIN(BEEndTile, pDC->drawId);
574 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
575 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
576 {
577 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
578 }
579 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
580 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
581
582 for(uint32_t rt = 0; rt < NumRT; ++rt)
583 {
584 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
585 }
586 AR_END(BEEndTile, 0);
587 }
588 }
589 AR_END(BESingleSampleBackend, 0);
590 }
591
592 template<typename T>
593 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
594 {
595 SWR_CONTEXT *pContext = pDC->pContext;
596
597 AR_BEGIN(BESampleRateBackend, pDC->drawId);
598 AR_BEGIN(BESetup, pDC->drawId);
599
600 const API_STATE& state = GetApiState(pDC);
601 const SWR_RASTSTATE& rastState = state.rastState;
602 const SWR_PS_STATE *pPSState = &state.psState;
603 const SWR_BLEND_STATE *pBlendState = &state.blendState;
604
605 // broadcast scalars
606 BarycentricCoeffs coeffs;
607 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
608 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
609 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
610
611 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
612 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
613 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
614
615 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
616 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
617 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
618
619 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
620
621 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
622 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
623 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
624
625 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
626 uint32_t NumRT = state.psState.numRenderTargets;
627 for(uint32_t rt = 0; rt < NumRT; ++rt)
628 {
629 pColorBase[rt] = renderBuffers.pColor[rt];
630 }
631 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
632 AR_END(BESetup, 0);
633
634 SWR_PS_CONTEXT psContext;
635 psContext.pAttribs = work.pAttribs;
636 psContext.pPerspAttribs = work.pPerspAttribs;
637 psContext.pRecipW = work.pRecipW;
638 psContext.frontFace = work.triFlags.frontFacing;
639 psContext.primID = work.triFlags.primID;
640
641 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
642 psContext.I = work.I;
643 psContext.J = work.J;
644 psContext.recipDet = work.recipDet;
645 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
646 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
647 psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
648
649 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
650 {
651 // UL pixel corner
652 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
653 // pixel center
654 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
655
656 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
657 {
658 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
659 // pixel center
660 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
661
662 AR_BEGIN(BEBarycentric, pDC->drawId);
663 CalcPixelBarycentrics(coeffs, psContext);
664 AR_END(BEBarycentric, 0);
665
666 if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
667 {
668 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
669 &work.coverageMask[0];
670 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
671 }
672
673 if(T::bCentroidPos)
674 {
675 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
676 AR_BEGIN(BEBarycentric, pDC->drawId);
677 if(T::bIsStandardPattern)
678 {
679 CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
680 }
681 else
682 {
683 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
684 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
685 }
686 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
687 AR_END(BEBarycentric, 0);
688 }
689 else
690 {
691 psContext.vX.centroid = psContext.vX.sample;
692 psContext.vY.centroid = psContext.vY.sample;
693 }
694
695 for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
696 {
697 simdmask coverageMask = work.coverageMask[sample] & MASK;
698 if (coverageMask)
699 {
700 AR_BEGIN(BEBarycentric, pDC->drawId);
701 // calculate per sample positions
702 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
703 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
704
705 CalcSampleBarycentrics(coeffs, psContext);
706
707 // interpolate and quantize z
708 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
709 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
710 AR_END(BEBarycentric, 0);
711
712 // interpolate user clip distance if available
713 if (rastState.clipDistanceMask)
714 {
715 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
716 psContext.vI.sample, psContext.vJ.sample);
717 }
718
719 simdscalar vCoverageMask = vMask(coverageMask);
720 simdscalar depthPassMask = vCoverageMask;
721 simdscalar stencilPassMask = vCoverageMask;
722
723 // offset depth/stencil buffers current sample
724 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
725 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
726
727 // Early-Z?
728 if (T::bCanEarlyZ)
729 {
730 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
731 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
732 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
733 AR_END(BEEarlyDepthTest, 0);
734
735 // early-exit if no samples passed depth or earlyZ is forced on.
736 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
737 {
738 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
739 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
740
741 if (!_simd_movemask_ps(depthPassMask))
742 {
743 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
744 continue;
745 }
746 }
747 }
748
749 psContext.sampleIndex = sample;
750 psContext.activeMask = _simd_castps_si(vCoverageMask);
751
752 // execute pixel shader
753 AR_BEGIN(BEPixelShader, pDC->drawId);
754 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
755 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
756 AR_END(BEPixelShader, 0);
757
758 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
759
760 // late-Z
761 if (!T::bCanEarlyZ)
762 {
763 AR_BEGIN(BELateDepthTest, pDC->drawId);
764 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
765 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
766 AR_END(BELateDepthTest, 0);
767
768 if (!_simd_movemask_ps(depthPassMask))
769 {
770 // need to call depth/stencil write for stencil write
771 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
772 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
773
774 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
775 continue;
776 }
777 }
778
779 uint32_t statMask = _simd_movemask_ps(depthPassMask);
780 uint32_t statCount = _mm_popcnt_u32(statMask);
781 UPDATE_STAT(DepthPassCount, statCount);
782
783 // output merger
784 AR_BEGIN(BEOutputMerger, pDC->drawId);
785 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
786
787 // do final depth write after all pixel kills
788 if (!pPSState->forceEarlyZ)
789 {
790 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
791 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
792 }
793 AR_END(BEOutputMerger, 0);
794 }
795 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
796 }
797 AR_BEGIN(BEEndTile, pDC->drawId);
798 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
799 {
800 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
801 }
802 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
803 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
804
805 for (uint32_t rt = 0; rt < NumRT; ++rt)
806 {
807 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
808 }
809 AR_END(BEEndTile, 0);
810 }
811 }
812 AR_END(BESampleRateBackend, 0);
813 }
814
815 template<typename T>
816 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
817 {
818 SWR_CONTEXT *pContext = pDC->pContext;
819
820 AR_BEGIN(BEPixelRateBackend, pDC->drawId);
821 AR_BEGIN(BESetup, pDC->drawId);
822
823 const API_STATE& state = GetApiState(pDC);
824 const SWR_RASTSTATE& rastState = state.rastState;
825 const SWR_PS_STATE *pPSState = &state.psState;
826 const SWR_BLEND_STATE *pBlendState = &state.blendState;
827
828 // broadcast scalars
829 BarycentricCoeffs coeffs;
830 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
831 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
832 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
833
834 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
835 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
836 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
837
838 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
839 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
840 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
841
842 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
843
844 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
845 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
846 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
847
848 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
849 uint32_t NumRT = state.psState.numRenderTargets;
850 for(uint32_t rt = 0; rt < NumRT; ++rt)
851 {
852 pColorBase[rt] = renderBuffers.pColor[rt];
853 }
854 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
855 AR_END(BESetup, 0);
856
857 SWR_PS_CONTEXT psContext;
858 psContext.pAttribs = work.pAttribs;
859 psContext.pPerspAttribs = work.pPerspAttribs;
860 psContext.frontFace = work.triFlags.frontFacing;
861 psContext.primID = work.triFlags.primID;
862 psContext.pRecipW = work.pRecipW;
863 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
864 psContext.I = work.I;
865 psContext.J = work.J;
866 psContext.recipDet = work.recipDet;
867 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
868 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
869 psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
870
871 psContext.sampleIndex = 0;
872
873 PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
874
875 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
876 {
877 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
878 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
879 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
880 {
881 simdscalar activeLanes;
882 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
883 activeLanes = vMask(work.anyCoveredSamples & MASK);
884
885 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
886 // set pixel center positions
887 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
888
889 AR_BEGIN(BEBarycentric, pDC->drawId);
890 CalcPixelBarycentrics(coeffs, psContext);
891 AR_END(BEBarycentric, 0);
892
893 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
894 {
895 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
896 &work.coverageMask[0];
897 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
898 }
899
900 if(T::bCentroidPos)
901 {
902 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
903 AR_BEGIN(BEBarycentric, pDC->drawId);
904 if(T::bIsStandardPattern)
905 {
906 CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
907 }
908 else
909 {
910 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
911 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
912 }
913
914 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
915 AR_END(BEBarycentric, 0);
916 }
917 else
918 {
919 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
920 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
921 }
922
923 if(T::bForcedSampleCount)
924 {
925 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
926 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
927 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
928 }
929
930 // Early-Z?
931 if(T::bCanEarlyZ && !T::bForcedSampleCount)
932 {
933 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
934 UPDATE_STAT(DepthPassCount, depthPassCount);
935 }
936
937 // if we have no covered samples that passed depth at this point, go to next tile
938 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
939
940 if(pPSState->usesSourceDepth)
941 {
942 AR_BEGIN(BEBarycentric, pDC->drawId);
943 // interpolate and quantize z
944 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
945 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
946 AR_END(BEBarycentric, 0);
947 }
948
949 // pixels that are currently active
950 psContext.activeMask = _simd_castps_si(activeLanes);
951 psContext.oMask = T::MultisampleT::FullSampleMask();
952
953 // execute pixel shader
954 AR_BEGIN(BEPixelShader, pDC->drawId);
955 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
956 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
957 AR_END(BEPixelShader, 0);
958
959 // update active lanes to remove any discarded or oMask'd pixels
960 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
961 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
962
963 // late-Z
964 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
965 {
966 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
967 UPDATE_STAT(DepthPassCount, depthPassCount);
968 }
969
970 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
971 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
972
973 // output merger
974 // loop over all samples, broadcasting the results of the PS to all passing pixels
975 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
976 {
977 AR_BEGIN(BEOutputMerger, pDC->drawId);
978 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
979 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
980 simdscalar coverageMask, depthMask;
981 if(T::bForcedSampleCount)
982 {
983 coverageMask = depthMask = activeLanes;
984 }
985 else
986 {
987 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
988 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
989 if(!_simd_movemask_ps(depthMask))
990 {
991 // stencil should already have been written in early/lateZ tests
992 AR_END(BEOutputMerger, 0);
993 continue;
994 }
995 }
996
997 // broadcast the results of the PS to all passing pixels
998 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
999
1000 if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
1001 {
1002 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
1003 uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
1004
1005 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1006 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1007 }
1008 AR_END(BEOutputMerger, 0);
1009 }
1010 Endtile:
1011 AR_BEGIN(BEEndTile, pDC->drawId);
1012 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1013 {
1014 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1015 }
1016
1017 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1018 {
1019 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1020 }
1021 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1022 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1023 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1024
1025 for(uint32_t rt = 0; rt < NumRT; ++rt)
1026 {
1027 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1028 }
1029 AR_END(BEEndTile, 0);
1030 }
1031 }
1032 AR_END(BEPixelRateBackend, 0);
1033 }
1034 // optimized backend flow with NULL PS
1035 template<uint32_t sampleCountT>
1036 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1037 {
1038 SWR_CONTEXT *pContext = pDC->pContext;
1039
1040 AR_BEGIN(BENullBackend, pDC->drawId);
1041 ///@todo: handle center multisample pattern
1042 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1043 AR_BEGIN(BESetup, pDC->drawId);
1044
1045 const API_STATE& state = GetApiState(pDC);
1046 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1047
1048 // broadcast scalars
1049 BarycentricCoeffs coeffs;
1050 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1051 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1052 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1053
1054 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1055 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1056 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1057
1058 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1059 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1060 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1061
1062 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1063
1064 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1065
1066 AR_END(BESetup, 0);
1067
1068 SWR_PS_CONTEXT psContext;
1069 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1070 {
1071 // UL pixel corner
1072 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1073
1074 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1075 {
1076 // UL pixel corners
1077 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1078
1079 // iterate over active samples
1080 unsigned long sample = 0;
1081 uint32_t sampleMask = state.blendState.sampleMask;
1082 while (_BitScanForward(&sample, sampleMask))
1083 {
1084 sampleMask &= ~(1 << sample);
1085 simdmask coverageMask = work.coverageMask[sample] & MASK;
1086 if (coverageMask)
1087 {
1088 AR_BEGIN(BEBarycentric, pDC->drawId);
1089 // calculate per sample positions
1090 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1091 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1092
1093 CalcSampleBarycentrics(coeffs, psContext);
1094
1095 // interpolate and quantize z
1096 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1097 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1098
1099 AR_END(BEBarycentric, 0);
1100
1101 // interpolate user clip distance if available
1102 if (rastState.clipDistanceMask)
1103 {
1104 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1105 psContext.vI.sample, psContext.vJ.sample);
1106 }
1107
1108 simdscalar vCoverageMask = vMask(coverageMask);
1109 simdscalar stencilPassMask = vCoverageMask;
1110
1111 // offset depth/stencil buffers current sample
1112 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
1113 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
1114
1115 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
1116 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
1117 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1118 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1119 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1120 AR_END(BEEarlyDepthTest, 0);
1121
1122 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1123 uint32_t statCount = _mm_popcnt_u32(statMask);
1124 UPDATE_STAT(DepthPassCount, statCount);
1125 }
1126 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1127 }
1128 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1129 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1130 }
1131 }
1132 AR_END(BENullBackend, 0);
1133 }
1134
1135 void InitClearTilesTable()
1136 {
1137 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1138
1139 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1140 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1141 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1142 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1143 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1144 }
1145
1146 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
1147 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
1148 [2] // centroid
1149 [2] // canEarlyZ
1150 = {};
1151 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1152 [SWR_MSAA_SAMPLE_PATTERN_COUNT]
1153 [SWR_INPUT_COVERAGE_COUNT]
1154 [2] // centroid
1155 [2] // forcedSampleCount
1156 [2] // canEarlyZ
1157 = {};
1158 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1159 [SWR_INPUT_COVERAGE_COUNT]
1160 [2] // centroid
1161 [2] // canEarlyZ
1162 = {};
1163
1164 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1165 // arguments to static template arguments.
1166 template <uint32_t... ArgsT>
1167 struct BEChooser
1168 {
1169 // Last Arg Terminator
1170 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1171 {
1172 switch(tArg)
1173 {
1174 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1175 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1176 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1177 default:
1178 SWR_ASSERT(0 && "Invalid backend func\n");
1179 return nullptr;
1180 break;
1181 }
1182 }
1183
1184 // Recursively parse args
1185 template <typename... TArgsT>
1186 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1187 {
1188 switch(tArg)
1189 {
1190 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1191 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1192 default:
1193 SWR_ASSERT(0 && "Invalid sample pattern\n");
1194 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1195 break;
1196 }
1197 }
1198
1199 // Recursively parse args
1200 template <typename... TArgsT>
1201 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
1202 {
1203 switch(tArg)
1204 {
1205 case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
1206 case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
1207 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
1208 default:
1209 SWR_ASSERT(0 && "Invalid sample pattern\n");
1210 return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
1211 break;
1212 }
1213 }
1214
1215 // Recursively parse args
1216 template <typename... TArgsT>
1217 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1218 {
1219 switch(tArg)
1220 {
1221 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1222 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1223 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1224 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1225 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1226 default:
1227 SWR_ASSERT(0 && "Invalid sample count\n");
1228 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1229 break;
1230 }
1231 }
1232
1233 // Recursively parse args
1234 template <typename... TArgsT>
1235 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1236 {
1237 if(tArg == true)
1238 {
1239 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1240 }
1241
1242 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1243 }
1244 };
1245
1246 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
1247 {
1248 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1249 {
1250 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1251 {
1252 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1253 {
1254 table[inputCoverage][isCentroid][canEarlyZ] =
1255 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1256 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1257 }
1258 }
1259 }
1260 }
1261
1262 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
1263 {
1264 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1265 {
1266 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
1267 {
1268 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1269 {
1270 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1271 {
1272 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1273 {
1274 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1275 {
1276 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1277 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
1278 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1279 }
1280 }
1281 }
1282 }
1283 }
1284 }
1285 }
1286
1287 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
1288 {
1289 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1290 {
1291 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1292 {
1293 for(uint32_t centroid = 0; centroid < 2; centroid++)
1294 {
1295 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1296 {
1297 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1298 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1299 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1300 }
1301 }
1302 }
1303 }
1304 }
1305
1306 void InitBackendFuncTables()
1307 {
1308 InitBackendSingleFuncTable(gBackendSingleSample);
1309 InitBackendPixelFuncTable(gBackendPixelRateTable);
1310 InitBackendSampleFuncTable(gBackendSampleRateTable);
1311
1312 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1313 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1314 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1315 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1316 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1317 }