swr: [rasterizer] Miscellaneous backend changes
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 RDTSC_START(BEDispatch);
51
52 SWR_CONTEXT *pContext = pDC->pContext;
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->pScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
77
78 RDTSC_STOP(BEDispatch, 1, 0);
79 }
80
81 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
82 {
83 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
84
85 uint32_t x, y;
86 MacroTileMgr::getTileIndices(macroTile, x, y);
87 SWR_ASSERT(x == 0 && y == 0);
88
89 if (pSync->pfnCallbackFunc != nullptr)
90 {
91 pSync->pfnCallbackFunc(pSync->userData, pSync->userData2, pSync->userData3);
92 }
93 }
94
95 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
96 {
97 QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData;
98 SWR_STATS* pStats = pQueryDesc->pStats;
99 SWR_CONTEXT *pContext = pDC->pContext;
100
101 SWR_ASSERT(pStats != nullptr);
102
103 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
104 {
105 pStats->DepthPassCount += pContext->stats[i].DepthPassCount;
106
107 pStats->IaVertices += pContext->stats[i].IaVertices;
108 pStats->IaPrimitives += pContext->stats[i].IaPrimitives;
109 pStats->VsInvocations += pContext->stats[i].VsInvocations;
110 pStats->HsInvocations += pContext->stats[i].HsInvocations;
111 pStats->DsInvocations += pContext->stats[i].DsInvocations;
112 pStats->GsInvocations += pContext->stats[i].GsInvocations;
113 pStats->PsInvocations += pContext->stats[i].PsInvocations;
114 pStats->CInvocations += pContext->stats[i].CInvocations;
115 pStats->CsInvocations += pContext->stats[i].CsInvocations;
116 pStats->CPrimitives += pContext->stats[i].CPrimitives;
117 pStats->GsPrimitives += pContext->stats[i].GsPrimitives;
118
119 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
120 {
121 pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream];
122
123 /// @note client is required to provide valid write offset before every draw, so we clear
124 /// out the contents of the write offset when storing stats
125 pContext->stats[i].SoWriteOffset[stream] = 0;
126
127 pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream];
128 pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream];
129 }
130 }
131 }
132
133 template<SWR_FORMAT format>
134 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
135 {
136 auto lambda = [&](int comp)
137 {
138 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
139 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
140 };
141
142 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
143 for (uint32_t i = 0; i < numIter; ++i)
144 {
145 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
146 }
147 }
148
149 template<SWR_FORMAT format>
150 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4])
151 {
152 // convert clear color to hottile format
153 // clear color is in RGBA float/uint32
154 simdvector vClear;
155 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
156 {
157 simdscalar vComp;
158 vComp = _simd_load1_ps((const float*)&clear[comp]);
159 if (FormatTraits<format>::isNormalized(comp))
160 {
161 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
162 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
163 }
164 vComp = FormatTraits<format>::pack(comp, vComp);
165 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
166 }
167
168 uint32_t tileX, tileY;
169 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
170 const API_STATE& state = GetApiState(pDC);
171
172 int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY;
173 int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1;
174 int left = KNOB_MACROTILE_X_DIM_FIXED * tileX;
175 int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1;
176
177 // intersect with scissor
178 top = std::max(top, state.scissorInFixedPoint.top);
179 left = std::max(left, state.scissorInFixedPoint.left);
180 bottom = std::min(bottom, state.scissorInFixedPoint.bottom);
181 right = std::min(right, state.scissorInFixedPoint.right);
182
183 // translate to local hottile origin
184 top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
185 bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
186 left -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
187 right -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
188
189 // convert to raster tiles
190 top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
191 bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
192 left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
193 right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
194
195 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
196 // compute steps between raster tile samples / raster tiles / macro tile rows
197 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
198 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
199 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
200 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
201
202 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
203 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples;
204 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
205
206 // loop over all raster tiles in the current hot tile
207 for (int y = top; y <= bottom; ++y)
208 {
209 uint8_t* pRasterTile = pRasterTileRow;
210 for (int x = left; x <= right; ++x)
211 {
212 for( int sampleNum = 0; sampleNum < numSamples; sampleNum++)
213 {
214 ClearRasterTile<format>(pRasterTile, vClear);
215 pRasterTile += rasterTileSampleStep;
216 }
217 }
218 pRasterTileRow += macroTileRowStep;
219 }
220
221 pHotTile->state = HOTTILE_DIRTY;
222 }
223
224
225 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
226 {
227 if (KNOB_FAST_CLEAR)
228 {
229 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
230 SWR_CONTEXT *pContext = pDC->pContext;
231 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
232 uint32_t numSamples = GetNumSamples(sampleCount);
233
234 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
235
236 RDTSC_START(BEClear);
237
238 if (pClear->flags.mask & SWR_CLEAR_COLOR)
239 {
240 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
241 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
242 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
243 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
244 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
245 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
246 pHotTile->state = HOTTILE_CLEAR;
247 }
248
249 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
250 {
251 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
252 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
253 pHotTile->state = HOTTILE_CLEAR;
254 }
255
256 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
257 {
258 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
259
260 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
261 pHotTile->state = HOTTILE_CLEAR;
262 }
263
264 RDTSC_STOP(BEClear, 0, 0);
265 }
266 else
267 {
268 // Legacy clear
269 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
270 RDTSC_START(BEClear);
271
272 if (pClear->flags.mask & SWR_CLEAR_COLOR)
273 {
274 /// @todo clear data should come in as RGBA32_FLOAT
275 DWORD clearData[4];
276 float clearFloat[4];
277 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
278 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
279 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
280 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
281 clearData[0] = *(DWORD*)&clearFloat[0];
282 clearData[1] = *(DWORD*)&clearFloat[1];
283 clearData[2] = *(DWORD*)&clearFloat[2];
284 clearData[3] = *(DWORD*)&clearFloat[3];
285
286 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
287 SWR_ASSERT(pfnClearTiles != nullptr);
288
289 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData);
290 }
291
292 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
293 {
294 DWORD clearData[4];
295 clearData[0] = *(DWORD*)&pClear->clearDepth;
296 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
297 SWR_ASSERT(pfnClearTiles != nullptr);
298
299 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData);
300 }
301
302 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
303 {
304 uint32_t value = pClear->clearStencil;
305 DWORD clearData[4];
306 clearData[0] = *(DWORD*)&value;
307 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
308
309 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData);
310 }
311
312 RDTSC_STOP(BEClear, 0, 0);
313 }
314 }
315
316
317 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
318 {
319 RDTSC_START(BEStoreTiles);
320 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
321 SWR_CONTEXT *pContext = pDC->pContext;
322
323 #ifdef KNOB_ENABLE_RDTSC
324 uint32_t numTiles = 0;
325 #endif
326 SWR_FORMAT srcFormat;
327 switch (pDesc->attachment)
328 {
329 case SWR_ATTACHMENT_COLOR0:
330 case SWR_ATTACHMENT_COLOR1:
331 case SWR_ATTACHMENT_COLOR2:
332 case SWR_ATTACHMENT_COLOR3:
333 case SWR_ATTACHMENT_COLOR4:
334 case SWR_ATTACHMENT_COLOR5:
335 case SWR_ATTACHMENT_COLOR6:
336 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
337 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
338 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
339 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
340 }
341
342 uint32_t x, y;
343 MacroTileMgr::getTileIndices(macroTile, x, y);
344
345 // Only need to store the hottile if it's been rendered to...
346 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
347 if (pHotTile)
348 {
349 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
350 if (pHotTile->state == HOTTILE_CLEAR)
351 {
352 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
353 SWR_ASSERT(pfnClearTiles != nullptr);
354
355 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData);
356 }
357
358 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
359 {
360 int destX = KNOB_MACROTILE_X_DIM * x;
361 int destY = KNOB_MACROTILE_Y_DIM * y;
362
363 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
364 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
365 }
366
367
368 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
369 {
370 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
371 }
372 }
373 RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
374 }
375
376
377 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
378 {
379 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
380 SWR_CONTEXT *pContext = pDC->pContext;
381
382 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
383
384 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
385 {
386 if (pDesc->attachmentMask & (1 << i))
387 {
388 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
389 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
390 if (pHotTile)
391 {
392 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
393 }
394 }
395 }
396 }
397
398 #if KNOB_SIMD_WIDTH == 8
399 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
400 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
401 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
402 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
403 #else
404 #error Unsupported vector width
405 #endif
406
407 INLINE
408 bool CanEarlyZ(const SWR_PS_STATE *pPSState)
409 {
410 return (pPSState->forceEarlyZ || (!pPSState->writesODepth && !pPSState->usesSourceDepth && !pPSState->usesUAV));
411 }
412
413 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
414 {
415 simdscalar vClipMask = _simd_setzero_ps();
416 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
417
418 for (uint32_t i = 0; i < numClipDistance; ++i)
419 {
420 // pull triangle clip distance values from clip buffer
421 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
422 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
423 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
424
425 // interpolate
426 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
427
428 // clip if interpolated clip distance is < 0 || NAN
429 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
430
431 vClipMask = _simd_or_ps(vClipMask, vCull);
432 }
433
434 return _simd_movemask_ps(vClipMask);
435 }
436
437 template<typename T>
438 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
439 {
440 RDTSC_START(BESingleSampleBackend);
441 RDTSC_START(BESetup);
442
443 SWR_CONTEXT *pContext = pDC->pContext;
444 const API_STATE& state = GetApiState(pDC);
445 const SWR_RASTSTATE& rastState = state.rastState;
446 const SWR_PS_STATE *pPSState = &state.psState;
447 const SWR_BLEND_STATE *pBlendState = &state.blendState;
448 uint64_t coverageMask = work.coverageMask[0];
449
450 // broadcast scalars
451 BarycentricCoeffs coeffs;
452 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
453 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
454 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
455
456 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
457 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
458 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
459
460 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
461 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
462 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
463
464 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
465
466 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
467 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
468 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
469
470 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
471 uint32_t NumRT = state.psState.numRenderTargets;
472 for(uint32_t rt = 0; rt < NumRT; ++rt)
473 {
474 pColorBase[rt] = renderBuffers.pColor[rt];
475 }
476 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
477 RDTSC_STOP(BESetup, 0, 0);
478
479 SWR_PS_CONTEXT psContext;
480 psContext.pAttribs = work.pAttribs;
481 psContext.pPerspAttribs = work.pPerspAttribs;
482 psContext.frontFace = work.triFlags.frontFacing;
483 psContext.primID = work.triFlags.primID;
484
485 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
486 psContext.I = work.I;
487 psContext.J = work.J;
488 psContext.recipDet = work.recipDet;
489 psContext.pRecipW = work.pRecipW;
490 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
491 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
492
493 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
494 {
495 // UL pixel corner
496 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
497 // pixel center
498 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
499
500 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
501 {
502 if(coverageMask & MASK)
503 {
504 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
505 // pixel center
506 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
507
508 if(T::bInputCoverage)
509 {
510 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
511 }
512
513 RDTSC_START(BEBarycentric);
514 CalcPixelBarycentrics(coeffs, psContext);
515
516 if(T::bCentroidPos)
517 {
518 // for 1x case, centroid is pixel center
519 psContext.vX.centroid = psContext.vX.center;
520 psContext.vY.centroid = psContext.vY.center;
521 psContext.vI.centroid = psContext.vI.center;
522 psContext.vJ.centroid = psContext.vJ.center;
523 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
524 }
525
526 // interpolate and quantize z
527 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
528 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
529 RDTSC_STOP(BEBarycentric, 0, 0);
530
531 simdmask clipCoverageMask = coverageMask & MASK;
532 // interpolate user clip distance if available
533 if(rastState.clipDistanceMask)
534 {
535 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
536 psContext.vI.center, psContext.vJ.center);
537 }
538
539 simdscalar vCoverageMask = vMask(clipCoverageMask);
540 simdscalar depthPassMask = vCoverageMask;
541 simdscalar stencilPassMask = vCoverageMask;
542
543 // Early-Z?
544 if(T::bCanEarlyZ)
545 {
546 RDTSC_START(BEEarlyDepthTest);
547 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
548 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
549 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
550
551 // early-exit if no pixels passed depth or earlyZ is forced on
552 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
553 {
554 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
555 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
556
557 if (!_simd_movemask_ps(depthPassMask))
558 {
559 goto Endtile;
560 }
561 }
562 }
563
564 psContext.sampleIndex = 0;
565 psContext.activeMask = _simd_castps_si(vCoverageMask);
566
567 // execute pixel shader
568 RDTSC_START(BEPixelShader);
569 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
570 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
571 RDTSC_STOP(BEPixelShader, 0, 0);
572
573 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
574
575 // late-Z
576 if(!T::bCanEarlyZ)
577 {
578 RDTSC_START(BELateDepthTest);
579 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
580 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
581 RDTSC_STOP(BELateDepthTest, 0, 0);
582
583 if(!_simd_movemask_ps(depthPassMask))
584 {
585 // need to call depth/stencil write for stencil write
586 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
587 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
588 goto Endtile;
589 }
590 }
591
592 uint32_t statMask = _simd_movemask_ps(depthPassMask);
593 uint32_t statCount = _mm_popcnt_u32(statMask);
594 UPDATE_STAT(DepthPassCount, statCount);
595
596 // output merger
597 RDTSC_START(BEOutputMerger);
598 OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
599
600 // do final depth write after all pixel kills
601 if (!pPSState->forceEarlyZ)
602 {
603 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
604 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
605 }
606 RDTSC_STOP(BEOutputMerger, 0, 0);
607 }
608
609 Endtile:
610 RDTSC_START(BEEndTile);
611 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
612 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
613 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
614
615 for(uint32_t rt = 0; rt < NumRT; ++rt)
616 {
617 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
618 }
619 RDTSC_STOP(BEEndTile, 0, 0);
620 }
621 }
622 RDTSC_STOP(BESingleSampleBackend, 0, 0);
623 }
624
625 template<typename T>
626 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
627 {
628 RDTSC_START(BESampleRateBackend);
629 RDTSC_START(BESetup);
630
631 SWR_CONTEXT *pContext = pDC->pContext;
632 const API_STATE& state = GetApiState(pDC);
633 const SWR_RASTSTATE& rastState = state.rastState;
634 const SWR_PS_STATE *pPSState = &state.psState;
635 const SWR_BLEND_STATE *pBlendState = &state.blendState;
636
637 // broadcast scalars
638 BarycentricCoeffs coeffs;
639 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
640 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
641 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
642
643 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
644 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
645 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
646
647 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
648 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
649 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
650
651 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
652
653 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
654 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
655 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
656
657 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
658 uint32_t NumRT = state.psState.numRenderTargets;
659 for(uint32_t rt = 0; rt < NumRT; ++rt)
660 {
661 pColorBase[rt] = renderBuffers.pColor[rt];
662 }
663 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
664 RDTSC_STOP(BESetup, 0, 0);
665
666 SWR_PS_CONTEXT psContext;
667 psContext.pAttribs = work.pAttribs;
668 psContext.pPerspAttribs = work.pPerspAttribs;
669 psContext.pRecipW = work.pRecipW;
670 psContext.frontFace = work.triFlags.frontFacing;
671 psContext.primID = work.triFlags.primID;
672
673 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
674 psContext.I = work.I;
675 psContext.J = work.J;
676 psContext.recipDet = work.recipDet;
677 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
678 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
679
680 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
681 {
682 // UL pixel corner
683 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
684 // pixel center
685 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
686
687 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
688 {
689 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
690 // pixel center
691 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
692
693 RDTSC_START(BEBarycentric);
694 CalcPixelBarycentrics(coeffs, psContext);
695 RDTSC_STOP(BEBarycentric, 0, 0);
696
697 if(T::bInputCoverage)
698 {
699 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
700 }
701
702 if(T::bCentroidPos)
703 {
704 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
705 RDTSC_START(BEBarycentric);
706 CalcCentroidBarycentrics<T>(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
707 RDTSC_STOP(BEBarycentric, 0, 0);
708 }
709
710 for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
711 {
712 simdmask coverageMask = work.coverageMask[sample] & MASK;
713 if (coverageMask)
714 {
715 RDTSC_START(BEBarycentric);
716 // calculate per sample positions
717 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
718 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
719
720 CalcSampleBarycentrics(coeffs, psContext);
721
722 // interpolate and quantize z
723 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
724 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
725 RDTSC_STOP(BEBarycentric, 0, 0);
726
727 // interpolate user clip distance if available
728 if (rastState.clipDistanceMask)
729 {
730 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
731 psContext.vI.sample, psContext.vJ.sample);
732 }
733
734 simdscalar vCoverageMask = vMask(coverageMask);
735 simdscalar depthPassMask = vCoverageMask;
736 simdscalar stencilPassMask = vCoverageMask;
737
738 // offset depth/stencil buffers current sample
739 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
740 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
741
742 // Early-Z?
743 if (T::bCanEarlyZ)
744 {
745 RDTSC_START(BEEarlyDepthTest);
746 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
747 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
748 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
749
750 // early-exit if no samples passed depth or earlyZ is forced on.
751 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
752 {
753 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
754 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
755
756 if (!_simd_movemask_ps(depthPassMask))
757 {
758 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
759 continue;
760 }
761 }
762 }
763
764 psContext.sampleIndex = sample;
765 psContext.activeMask = _simd_castps_si(vCoverageMask);
766
767 // execute pixel shader
768 RDTSC_START(BEPixelShader);
769 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
770 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
771 RDTSC_STOP(BEPixelShader, 0, 0);
772
773 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
774
775 // late-Z
776 if (!T::bCanEarlyZ)
777 {
778 RDTSC_START(BELateDepthTest);
779 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
780 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
781 RDTSC_STOP(BELateDepthTest, 0, 0);
782
783 if (!_simd_movemask_ps(depthPassMask))
784 {
785 // need to call depth/stencil write for stencil write
786 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
787 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
788
789 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
790 continue;
791 }
792 }
793
794 uint32_t statMask = _simd_movemask_ps(depthPassMask);
795 uint32_t statCount = _mm_popcnt_u32(statMask);
796 UPDATE_STAT(DepthPassCount, statCount);
797
798 // output merger
799 RDTSC_START(BEOutputMerger);
800 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
801
802 // do final depth write after all pixel kills
803 if (!pPSState->forceEarlyZ)
804 {
805 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
806 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
807 }
808 RDTSC_STOP(BEOutputMerger, 0, 0);
809 }
810 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
811 }
812 RDTSC_START(BEEndTile);
813 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
814 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
815
816 for (uint32_t rt = 0; rt < NumRT; ++rt)
817 {
818 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
819 }
820 RDTSC_STOP(BEEndTile, 0, 0);
821 }
822 }
823 RDTSC_STOP(BESampleRateBackend, 0, 0);
824 }
825
826 template<typename T>
827 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
828 {
829 RDTSC_START(BEPixelRateBackend);
830 RDTSC_START(BESetup);
831
832 SWR_CONTEXT *pContext = pDC->pContext;
833 const API_STATE& state = GetApiState(pDC);
834 const SWR_RASTSTATE& rastState = state.rastState;
835 const SWR_PS_STATE *pPSState = &state.psState;
836 const SWR_BLEND_STATE *pBlendState = &state.blendState;
837
838 // broadcast scalars
839 BarycentricCoeffs coeffs;
840 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
841 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
842 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
843
844 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
845 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
846 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
847
848 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
849 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
850 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
851
852 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
853
854 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
855 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
856 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
857
858 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
859 uint32_t NumRT = state.psState.numRenderTargets;
860 for(uint32_t rt = 0; rt < NumRT; ++rt)
861 {
862 pColorBase[rt] = renderBuffers.pColor[rt];
863 }
864 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
865 RDTSC_STOP(BESetup, 0, 0);
866
867 SWR_PS_CONTEXT psContext;
868 psContext.pAttribs = work.pAttribs;
869 psContext.pPerspAttribs = work.pPerspAttribs;
870 psContext.frontFace = work.triFlags.frontFacing;
871 psContext.primID = work.triFlags.primID;
872 psContext.pRecipW = work.pRecipW;
873 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
874 psContext.I = work.I;
875 psContext.J = work.J;
876 psContext.recipDet = work.recipDet;
877 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
878 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
879 psContext.sampleIndex = 0;
880
881 PixelRateZTestLoop<T> PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
882
883 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
884 {
885 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
886 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
887 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
888 {
889 simdscalar activeLanes;
890 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
891 activeLanes = vMask(work.anyCoveredSamples & MASK);
892
893 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
894 // set pixel center positions
895 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
896
897 RDTSC_START(BEBarycentric);
898 CalcPixelBarycentrics(coeffs, psContext);
899 RDTSC_STOP(BEBarycentric, 0, 0);
900
901 if (T::bInputCoverage)
902 {
903 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
904 }
905
906 if(T::bCentroidPos)
907 {
908 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
909 RDTSC_START(BEBarycentric);
910 CalcCentroidBarycentrics<T>(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
911 RDTSC_STOP(BEBarycentric, 0, 0);
912 }
913
914 if(T::bForcedSampleCount)
915 {
916 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
917 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
918 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
919 }
920
921 // Early-Z?
922 if(T::bCanEarlyZ && !T::bForcedSampleCount)
923 {
924 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
925 UPDATE_STAT(DepthPassCount, depthPassCount);
926 }
927
928 // if we have no covered samples that passed depth at this point, go to next tile
929 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
930
931 if(pPSState->usesSourceDepth)
932 {
933 RDTSC_START(BEBarycentric);
934 // interpolate and quantize z
935 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
936 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
937 RDTSC_STOP(BEBarycentric, 0, 0);
938 }
939
940 // pixels that are currently active
941 psContext.activeMask = _simd_castps_si(activeLanes);
942 psContext.oMask = T::MultisampleT::FullSampleMask();
943
944 // execute pixel shader
945 RDTSC_START(BEPixelShader);
946 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
947 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
948 RDTSC_STOP(BEPixelShader, 0, 0);
949
950 // update active lanes to remove any discarded or oMask'd pixels
951 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
952 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
953
954 // late-Z
955 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
956 {
957 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
958 UPDATE_STAT(DepthPassCount, depthPassCount);
959 }
960
961 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
962 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
963
964 // output merger
965 // loop over all samples, broadcasting the results of the PS to all passing pixels
966 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
967 {
968 RDTSC_START(BEOutputMerger);
969 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
970 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
971 simdscalar coverageMask, depthMask;
972 if(T::bForcedSampleCount)
973 {
974 coverageMask = depthMask = activeLanes;
975 }
976 else
977 {
978 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
979 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
980 if(!_simd_movemask_ps(depthMask))
981 {
982 // stencil should already have been written in early/lateZ tests
983 RDTSC_STOP(BEOutputMerger, 0, 0);
984 continue;
985 }
986 }
987
988 // broadcast the results of the PS to all passing pixels
989 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
990
991 if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
992 {
993 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
994 uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
995
996 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
997 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
998 }
999 RDTSC_STOP(BEOutputMerger, 0, 0);
1000 }
1001 Endtile:
1002 RDTSC_START(BEEndTile);
1003 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1004 {
1005 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1006 }
1007
1008 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1009 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1010 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1011
1012 for(uint32_t rt = 0; rt < NumRT; ++rt)
1013 {
1014 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1015 }
1016 RDTSC_STOP(BEEndTile, 0, 0);
1017 }
1018 }
1019 RDTSC_STOP(BEPixelRateBackend, 0, 0);
1020 }
1021 // optimized backend flow with NULL PS
1022 template<uint32_t sampleCountT>
1023 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1024 {
1025 RDTSC_START(BENullBackend);
1026 ///@todo: handle center multisample pattern
1027 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1028 RDTSC_START(BESetup);
1029
1030 SWR_CONTEXT *pContext = pDC->pContext;
1031 const API_STATE& state = GetApiState(pDC);
1032 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1033
1034 // broadcast scalars
1035 BarycentricCoeffs coeffs;
1036 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1037 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1038 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1039
1040 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1041 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1042 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1043
1044 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1045 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1046 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1047
1048 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1049
1050 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1051
1052 RDTSC_STOP(BESetup, 0, 0);
1053
1054 SWR_PS_CONTEXT psContext;
1055 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1056 {
1057 // UL pixel corner
1058 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1059
1060 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1061 {
1062 // UL pixel corners
1063 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1064
1065 // iterate over active samples
1066 unsigned long sample = 0;
1067 uint32_t sampleMask = state.blendState.sampleMask;
1068 while (_BitScanForward(&sample, sampleMask))
1069 {
1070 sampleMask &= ~(1 << sample);
1071 simdmask coverageMask = work.coverageMask[sample] & MASK;
1072 if (coverageMask)
1073 {
1074 RDTSC_START(BEBarycentric);
1075 // calculate per sample positions
1076 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1077 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1078
1079 CalcSampleBarycentrics(coeffs, psContext);
1080
1081 // interpolate and quantize z
1082 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1083 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1084
1085 RDTSC_STOP(BEBarycentric, 0, 0);
1086
1087 // interpolate user clip distance if available
1088 if (rastState.clipDistanceMask)
1089 {
1090 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1091 psContext.vI.sample, psContext.vJ.sample);
1092 }
1093
1094 simdscalar vCoverageMask = vMask(coverageMask);
1095 simdscalar stencilPassMask = vCoverageMask;
1096
1097 // offset depth/stencil buffers current sample
1098 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
1099 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
1100
1101 RDTSC_START(BEEarlyDepthTest);
1102 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1103 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1104 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1105 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1106 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1107
1108 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1109 uint32_t statCount = _mm_popcnt_u32(statMask);
1110 UPDATE_STAT(DepthPassCount, statCount);
1111 }
1112 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1113 }
1114 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1115 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1116 }
1117 }
1118 RDTSC_STOP(BENullBackend, 0, 0);
1119 }
1120
1121 void InitClearTilesTable()
1122 {
1123 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1124
1125 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1126 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1127 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1128 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1129 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1130 }
1131
1132 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
1133 PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage
1134 [2] // centroid
1135 [2] // canEarlyZ
1136 = {};
1137 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX]
1138 [SWR_MSAA_SAMPLE_PATTERN_MAX]
1139 [SWR_INPUT_COVERAGE_MAX]
1140 [2] // centroid
1141 [2] // forcedSampleCount
1142 [2] // canEarlyZ
1143 = {};
1144 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX]
1145 [2] // centroid
1146 [2] // canEarlyZ
1147 = {};
1148
1149 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1150 // arguments to static template arguments.
1151 template <uint32_t... ArgsT>
1152 struct BEChooser
1153 {
1154 // Last Arg Terminator
1155 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1156 {
1157 switch(tArg)
1158 {
1159 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1160 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1161 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1162 default:
1163 SWR_ASSERT(0 && "Invalid backend func\n");
1164 return nullptr;
1165 break;
1166 }
1167 }
1168
1169 // Recursively parse args
1170 template <typename... TArgsT>
1171 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1172 {
1173 switch(tArg)
1174 {
1175 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1176 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1177 default:
1178 SWR_ASSERT(0 && "Invalid sample pattern\n");
1179 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1180 break;
1181 }
1182 }
1183
1184 // Recursively parse args
1185 template <typename... TArgsT>
1186 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1187 {
1188 switch(tArg)
1189 {
1190 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1191 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1192 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1193 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1194 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1195 default:
1196 SWR_ASSERT(0 && "Invalid sample count\n");
1197 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1198 break;
1199 }
1200 }
1201
1202 // Recursively parse args
1203 template <typename... TArgsT>
1204 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1205 {
1206 if(tArg == true)
1207 {
1208 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1209 }
1210
1211 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1212 }
1213 };
1214
1215 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2])
1216 {
1217 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
1218 {
1219 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1220 {
1221 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1222 {
1223 table[inputCoverage][isCentroid][canEarlyZ] =
1224 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL),
1225 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1226 }
1227 }
1228 }
1229 }
1230
1231 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX]
1232 [2][2][2])
1233 {
1234 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
1235 {
1236 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_MAX; samplePattern++)
1237 {
1238 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
1239 {
1240 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1241 {
1242 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1243 {
1244 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1245 {
1246 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1247 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL),
1248 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1249 }
1250 }
1251 }
1252 }
1253 }
1254 }
1255 }
1256
1257 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2])
1258 {
1259 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
1260 {
1261 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
1262 {
1263 for(uint32_t centroid = 0; centroid < 2; centroid++)
1264 {
1265 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1266 {
1267 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1268 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL),
1269 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1270 }
1271 }
1272 }
1273 }
1274 }
1275
1276 void InitBackendFuncTables()
1277 {
1278 InitBackendSingleFuncTable(gBackendSingleSample);
1279 InitBackendPixelFuncTable(gBackendPixelRateTable);
1280 InitBackendSampleFuncTable(gBackendSampleRateTable);
1281
1282 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1283 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1284 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1285 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1286 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1287 }