5d83baf391c3c7b4781e36fa7f86e8ab77ebfc8d
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 RDTSC_START(BEDispatch);
51
52 SWR_CONTEXT *pContext = pDC->pContext;
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->pScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
77
78 RDTSC_STOP(BEDispatch, 1, 0);
79 }
80
81 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
82 {
83 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
84
85 uint32_t x, y;
86 MacroTileMgr::getTileIndices(macroTile, x, y);
87 SWR_ASSERT(x == 0 && y == 0);
88
89 if (pSync->pfnCallbackFunc != nullptr)
90 {
91 pSync->pfnCallbackFunc(pSync->userData, pSync->userData2, pSync->userData3);
92 }
93 }
94
95 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
96 {
97 QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData;
98 SWR_STATS* pStats = pQueryDesc->pStats;
99 SWR_CONTEXT *pContext = pDC->pContext;
100
101 SWR_ASSERT(pStats != nullptr);
102
103 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
104 {
105 pStats->DepthPassCount += pContext->stats[i].DepthPassCount;
106
107 pStats->IaVertices += pContext->stats[i].IaVertices;
108 pStats->IaPrimitives += pContext->stats[i].IaPrimitives;
109 pStats->VsInvocations += pContext->stats[i].VsInvocations;
110 pStats->HsInvocations += pContext->stats[i].HsInvocations;
111 pStats->DsInvocations += pContext->stats[i].DsInvocations;
112 pStats->GsInvocations += pContext->stats[i].GsInvocations;
113 pStats->PsInvocations += pContext->stats[i].PsInvocations;
114 pStats->CInvocations += pContext->stats[i].CInvocations;
115 pStats->CsInvocations += pContext->stats[i].CsInvocations;
116 pStats->CPrimitives += pContext->stats[i].CPrimitives;
117 pStats->GsPrimitives += pContext->stats[i].GsPrimitives;
118
119 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
120 {
121 pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream];
122
123 /// @note client is required to provide valid write offset before every draw, so we clear
124 /// out the contents of the write offset when storing stats
125 pContext->stats[i].SoWriteOffset[stream] = 0;
126
127 pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream];
128 pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream];
129 }
130 }
131 }
132
133 template<SWR_FORMAT format>
134 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
135 {
136 auto lambda = [&](int comp)
137 {
138 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
139 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
140 };
141
142 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
143 for (uint32_t i = 0; i < numIter; ++i)
144 {
145 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
146 }
147 }
148
149 template<SWR_FORMAT format>
150 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4])
151 {
152 // convert clear color to hottile format
153 // clear color is in RGBA float/uint32
154 simdvector vClear;
155 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
156 {
157 simdscalar vComp;
158 vComp = _simd_load1_ps((const float*)&clear[comp]);
159 if (FormatTraits<format>::isNormalized(comp))
160 {
161 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
162 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
163 }
164 vComp = FormatTraits<format>::pack(comp, vComp);
165 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
166 }
167
168 uint32_t tileX, tileY;
169 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
170 const API_STATE& state = GetApiState(pDC);
171
172 int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY;
173 int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1;
174 int left = KNOB_MACROTILE_X_DIM_FIXED * tileX;
175 int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1;
176
177 // intersect with scissor
178 top = std::max(top, state.scissorInFixedPoint.top);
179 left = std::max(left, state.scissorInFixedPoint.left);
180 bottom = std::min(bottom, state.scissorInFixedPoint.bottom);
181 right = std::min(right, state.scissorInFixedPoint.right);
182
183 // translate to local hottile origin
184 top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
185 bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
186 left -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
187 right -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
188
189 // convert to raster tiles
190 top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
191 bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
192 left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
193 right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
194
195 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
196 // compute steps between raster tile samples / raster tiles / macro tile rows
197 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
198 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
199 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
200 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
201
202 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
203 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples;
204 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
205
206 // loop over all raster tiles in the current hot tile
207 for (int y = top; y <= bottom; ++y)
208 {
209 uint8_t* pRasterTile = pRasterTileRow;
210 for (int x = left; x <= right; ++x)
211 {
212 for( int sampleNum = 0; sampleNum < numSamples; sampleNum++)
213 {
214 ClearRasterTile<format>(pRasterTile, vClear);
215 pRasterTile += rasterTileSampleStep;
216 }
217 }
218 pRasterTileRow += macroTileRowStep;
219 }
220
221 pHotTile->state = HOTTILE_DIRTY;
222 }
223
224
225 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
226 {
227 if (KNOB_FAST_CLEAR)
228 {
229 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
230 SWR_CONTEXT *pContext = pDC->pContext;
231 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
232 uint32_t numSamples = GetNumSamples(sampleCount);
233
234 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
235
236 RDTSC_START(BEClear);
237
238 if (pClear->flags.mask & SWR_CLEAR_COLOR)
239 {
240 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
241 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
242 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
243 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
244 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
245 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
246 pHotTile->state = HOTTILE_CLEAR;
247 }
248
249 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
250 {
251 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
252 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
253 pHotTile->state = HOTTILE_CLEAR;
254 }
255
256 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
257 {
258 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
259
260 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
261 pHotTile->state = HOTTILE_CLEAR;
262 }
263
264 RDTSC_STOP(BEClear, 0, 0);
265 }
266 else
267 {
268 // Legacy clear
269 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
270 RDTSC_START(BEClear);
271
272 if (pClear->flags.mask & SWR_CLEAR_COLOR)
273 {
274 /// @todo clear data should come in as RGBA32_FLOAT
275 DWORD clearData[4];
276 float clearFloat[4];
277 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
278 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
279 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
280 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
281 clearData[0] = *(DWORD*)&clearFloat[0];
282 clearData[1] = *(DWORD*)&clearFloat[1];
283 clearData[2] = *(DWORD*)&clearFloat[2];
284 clearData[3] = *(DWORD*)&clearFloat[3];
285
286 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
287 SWR_ASSERT(pfnClearTiles != nullptr);
288
289 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData);
290 }
291
292 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
293 {
294 DWORD clearData[4];
295 clearData[0] = *(DWORD*)&pClear->clearDepth;
296 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
297 SWR_ASSERT(pfnClearTiles != nullptr);
298
299 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData);
300 }
301
302 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
303 {
304 uint32_t value = pClear->clearStencil;
305 DWORD clearData[4];
306 clearData[0] = *(DWORD*)&value;
307 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
308
309 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData);
310 }
311
312 RDTSC_STOP(BEClear, 0, 0);
313 }
314 }
315
316
317 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
318 {
319 RDTSC_START(BEStoreTiles);
320 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
321 SWR_CONTEXT *pContext = pDC->pContext;
322
323 #ifdef KNOB_ENABLE_RDTSC
324 uint32_t numTiles = 0;
325 #endif
326 SWR_FORMAT srcFormat;
327 switch (pDesc->attachment)
328 {
329 case SWR_ATTACHMENT_COLOR0:
330 case SWR_ATTACHMENT_COLOR1:
331 case SWR_ATTACHMENT_COLOR2:
332 case SWR_ATTACHMENT_COLOR3:
333 case SWR_ATTACHMENT_COLOR4:
334 case SWR_ATTACHMENT_COLOR5:
335 case SWR_ATTACHMENT_COLOR6:
336 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
337 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
338 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
339 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
340 }
341
342 uint32_t x, y;
343 MacroTileMgr::getTileIndices(macroTile, x, y);
344
345 // Only need to store the hottile if it's been rendered to...
346 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
347 if (pHotTile)
348 {
349 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
350 if (pHotTile->state == HOTTILE_CLEAR)
351 {
352 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
353 SWR_ASSERT(pfnClearTiles != nullptr);
354
355 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData);
356 }
357
358 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
359 {
360 int destX = KNOB_MACROTILE_X_DIM * x;
361 int destY = KNOB_MACROTILE_Y_DIM * y;
362
363 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
364 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
365 }
366
367
368 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
369 {
370 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
371 }
372 }
373 RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
374 }
375
376
377 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
378 {
379 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
380 SWR_CONTEXT *pContext = pDC->pContext;
381
382 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
383
384 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
385 {
386 if (pDesc->attachmentMask & (1 << i))
387 {
388 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
389 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
390 if (pHotTile)
391 {
392 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
393 }
394 }
395 }
396 }
397
398 #if KNOB_SIMD_WIDTH == 8
399 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
400 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
401 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
402 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
403 #else
404 #error Unsupported vector width
405 #endif
406
407 INLINE
408 bool CanEarlyZ(const SWR_PS_STATE *pPSState)
409 {
410 return (pPSState->forceEarlyZ || (!pPSState->writesODepth && !pPSState->usesSourceDepth && !pPSState->usesUAV));
411 }
412
413 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
414 {
415 simdscalar vClipMask = _simd_setzero_ps();
416 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
417
418 for (uint32_t i = 0; i < numClipDistance; ++i)
419 {
420 // pull triangle clip distance values from clip buffer
421 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
422 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
423 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
424
425 // interpolate
426 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
427
428 // clip if interpolated clip distance is < 0 || NAN
429 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
430
431 vClipMask = _simd_or_ps(vClipMask, vCull);
432 }
433
434 return _simd_movemask_ps(vClipMask);
435 }
436
437 template<typename T>
438 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
439 {
440 RDTSC_START(BESingleSampleBackend);
441 RDTSC_START(BESetup);
442
443 SWR_CONTEXT *pContext = pDC->pContext;
444 const API_STATE& state = GetApiState(pDC);
445 const SWR_RASTSTATE& rastState = state.rastState;
446 const SWR_PS_STATE *pPSState = &state.psState;
447 const SWR_BLEND_STATE *pBlendState = &state.blendState;
448 uint64_t coverageMask = work.coverageMask[0];
449
450 // broadcast scalars
451 BarycentricCoeffs coeffs;
452 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
453 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
454 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
455
456 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
457 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
458 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
459
460 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
461 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
462 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
463
464 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
465
466 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
467 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
468 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
469
470 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
471 uint32_t NumRT = state.psState.numRenderTargets;
472 for(uint32_t rt = 0; rt < NumRT; ++rt)
473 {
474 pColorBase[rt] = renderBuffers.pColor[rt];
475 }
476 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
477 RDTSC_STOP(BESetup, 0, 0);
478
479 SWR_PS_CONTEXT psContext;
480 psContext.pAttribs = work.pAttribs;
481 psContext.pPerspAttribs = work.pPerspAttribs;
482 psContext.frontFace = work.triFlags.frontFacing;
483 psContext.primID = work.triFlags.primID;
484
485 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
486 psContext.I = work.I;
487 psContext.J = work.J;
488 psContext.recipDet = work.recipDet;
489 psContext.pRecipW = work.pRecipW;
490 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
491 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
492
493 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
494 {
495 // UL pixel corner
496 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
497 // pixel center
498 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
499
500 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
501 {
502 if(coverageMask & MASK)
503 {
504 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
505 // pixel center
506 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
507
508 if(T::bInputCoverage)
509 {
510 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
511 }
512
513 RDTSC_START(BEBarycentric);
514 CalcPixelBarycentrics(coeffs, psContext);
515
516 // for 1x case, centroid is pixel center
517 psContext.vX.centroid = psContext.vX.center;
518 psContext.vY.centroid = psContext.vY.center;
519 psContext.vI.centroid = psContext.vI.center;
520 psContext.vJ.centroid = psContext.vJ.center;
521 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
522
523 // interpolate and quantize z
524 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
525 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
526 RDTSC_STOP(BEBarycentric, 0, 0);
527
528 simdmask clipCoverageMask = coverageMask & MASK;
529 // interpolate user clip distance if available
530 if(rastState.clipDistanceMask)
531 {
532 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
533 psContext.vI.center, psContext.vJ.center);
534 }
535
536 simdscalar vCoverageMask = vMask(clipCoverageMask);
537 simdscalar depthPassMask = vCoverageMask;
538 simdscalar stencilPassMask = vCoverageMask;
539
540 // Early-Z?
541 if(T::bCanEarlyZ)
542 {
543 RDTSC_START(BEEarlyDepthTest);
544 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
545 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
546 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
547
548 // early-exit if no pixels passed depth or earlyZ is forced on
549 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
550 {
551 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
552 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
553
554 if (!_simd_movemask_ps(depthPassMask))
555 {
556 goto Endtile;
557 }
558 }
559 }
560
561 psContext.sampleIndex = 0;
562 psContext.activeMask = _simd_castps_si(vCoverageMask);
563
564 // execute pixel shader
565 RDTSC_START(BEPixelShader);
566 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
567 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
568 RDTSC_STOP(BEPixelShader, 0, 0);
569
570 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
571
572 // late-Z
573 if(!T::bCanEarlyZ)
574 {
575 RDTSC_START(BELateDepthTest);
576 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
577 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
578 RDTSC_STOP(BELateDepthTest, 0, 0);
579
580 if(!_simd_movemask_ps(depthPassMask))
581 {
582 // need to call depth/stencil write for stencil write
583 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
584 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
585 goto Endtile;
586 }
587 }
588
589 uint32_t statMask = _simd_movemask_ps(depthPassMask);
590 uint32_t statCount = _mm_popcnt_u32(statMask);
591 UPDATE_STAT(DepthPassCount, statCount);
592
593 // output merger
594 RDTSC_START(BEOutputMerger);
595 OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
596
597 // do final depth write after all pixel kills
598 if (!pPSState->forceEarlyZ)
599 {
600 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
601 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
602 }
603 RDTSC_STOP(BEOutputMerger, 0, 0);
604 }
605
606 Endtile:
607 RDTSC_START(BEEndTile);
608 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
609 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
610 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
611
612 for(uint32_t rt = 0; rt < NumRT; ++rt)
613 {
614 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
615 }
616 RDTSC_STOP(BEEndTile, 0, 0);
617 }
618 }
619 RDTSC_STOP(BESingleSampleBackend, 0, 0);
620 }
621
622 template<typename T>
623 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
624 {
625 RDTSC_START(BESampleRateBackend);
626 RDTSC_START(BESetup);
627
628 SWR_CONTEXT *pContext = pDC->pContext;
629 const API_STATE& state = GetApiState(pDC);
630 const SWR_RASTSTATE& rastState = state.rastState;
631 const SWR_PS_STATE *pPSState = &state.psState;
632 const SWR_BLEND_STATE *pBlendState = &state.blendState;
633
634 // broadcast scalars
635 BarycentricCoeffs coeffs;
636 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
637 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
638 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
639
640 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
641 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
642 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
643
644 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
645 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
646 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
647
648 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
649
650 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
651 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
652 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
653
654 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
655 uint32_t NumRT = state.psState.numRenderTargets;
656 for(uint32_t rt = 0; rt < NumRT; ++rt)
657 {
658 pColorBase[rt] = renderBuffers.pColor[rt];
659 }
660 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
661 RDTSC_STOP(BESetup, 0, 0);
662
663 SWR_PS_CONTEXT psContext;
664 psContext.pAttribs = work.pAttribs;
665 psContext.pPerspAttribs = work.pPerspAttribs;
666 psContext.pRecipW = work.pRecipW;
667 psContext.frontFace = work.triFlags.frontFacing;
668 psContext.primID = work.triFlags.primID;
669
670 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
671 psContext.I = work.I;
672 psContext.J = work.J;
673 psContext.recipDet = work.recipDet;
674 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
675 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
676
677 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
678 {
679 // UL pixel corner
680 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
681 // pixel center
682 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
683
684 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
685 {
686 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
687 // pixel center
688 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
689
690 RDTSC_START(BEBarycentric);
691 CalcPixelBarycentrics(coeffs, psContext);
692 RDTSC_STOP(BEBarycentric, 0, 0);
693
694 if(T::bInputCoverage)
695 {
696 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
697 }
698
699 if(T::bCentroidPos)
700 {
701 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
702 RDTSC_START(BEBarycentric);
703 if(T::bIsStandardPattern)
704 {
705 CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
706 }
707 else
708 {
709 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
710 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
711 }
712 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
713 RDTSC_STOP(BEBarycentric, 0, 0);
714 }
715 else
716 {
717 psContext.vX.centroid = psContext.vX.sample;
718 psContext.vY.centroid = psContext.vY.sample;
719 }
720
721 for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
722 {
723 simdmask coverageMask = work.coverageMask[sample] & MASK;
724 if (coverageMask)
725 {
726 RDTSC_START(BEBarycentric);
727 // calculate per sample positions
728 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
729 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
730
731 CalcSampleBarycentrics(coeffs, psContext);
732
733 // interpolate and quantize z
734 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
735 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
736 RDTSC_STOP(BEBarycentric, 0, 0);
737
738 // interpolate user clip distance if available
739 if (rastState.clipDistanceMask)
740 {
741 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
742 psContext.vI.sample, psContext.vJ.sample);
743 }
744
745 simdscalar vCoverageMask = vMask(coverageMask);
746 simdscalar depthPassMask = vCoverageMask;
747 simdscalar stencilPassMask = vCoverageMask;
748
749 // offset depth/stencil buffers current sample
750 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
751 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
752
753 // Early-Z?
754 if (T::bCanEarlyZ)
755 {
756 RDTSC_START(BEEarlyDepthTest);
757 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
758 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
759 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
760
761 // early-exit if no samples passed depth or earlyZ is forced on.
762 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
763 {
764 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
765 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
766
767 if (!_simd_movemask_ps(depthPassMask))
768 {
769 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
770 continue;
771 }
772 }
773 }
774
775 psContext.sampleIndex = sample;
776 psContext.activeMask = _simd_castps_si(vCoverageMask);
777
778 // execute pixel shader
779 RDTSC_START(BEPixelShader);
780 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
781 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
782 RDTSC_STOP(BEPixelShader, 0, 0);
783
784 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
785
786 // late-Z
787 if (!T::bCanEarlyZ)
788 {
789 RDTSC_START(BELateDepthTest);
790 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
791 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
792 RDTSC_STOP(BELateDepthTest, 0, 0);
793
794 if (!_simd_movemask_ps(depthPassMask))
795 {
796 // need to call depth/stencil write for stencil write
797 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
798 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
799
800 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
801 continue;
802 }
803 }
804
805 uint32_t statMask = _simd_movemask_ps(depthPassMask);
806 uint32_t statCount = _mm_popcnt_u32(statMask);
807 UPDATE_STAT(DepthPassCount, statCount);
808
809 // output merger
810 RDTSC_START(BEOutputMerger);
811 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
812
813 // do final depth write after all pixel kills
814 if (!pPSState->forceEarlyZ)
815 {
816 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
817 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
818 }
819 RDTSC_STOP(BEOutputMerger, 0, 0);
820 }
821 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
822 }
823 RDTSC_START(BEEndTile);
824 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
825 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
826
827 for (uint32_t rt = 0; rt < NumRT; ++rt)
828 {
829 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
830 }
831 RDTSC_STOP(BEEndTile, 0, 0);
832 }
833 }
834 RDTSC_STOP(BESampleRateBackend, 0, 0);
835 }
836
837 template<typename T>
838 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
839 {
840 RDTSC_START(BEPixelRateBackend);
841 RDTSC_START(BESetup);
842
843 SWR_CONTEXT *pContext = pDC->pContext;
844 const API_STATE& state = GetApiState(pDC);
845 const SWR_RASTSTATE& rastState = state.rastState;
846 const SWR_PS_STATE *pPSState = &state.psState;
847 const SWR_BLEND_STATE *pBlendState = &state.blendState;
848
849 // broadcast scalars
850 BarycentricCoeffs coeffs;
851 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
852 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
853 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
854
855 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
856 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
857 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
858
859 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
860 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
861 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
862
863 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
864
865 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
866 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
867 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
868
869 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
870 uint32_t NumRT = state.psState.numRenderTargets;
871 for(uint32_t rt = 0; rt < NumRT; ++rt)
872 {
873 pColorBase[rt] = renderBuffers.pColor[rt];
874 }
875 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
876 RDTSC_STOP(BESetup, 0, 0);
877
878 SWR_PS_CONTEXT psContext;
879 psContext.pAttribs = work.pAttribs;
880 psContext.pPerspAttribs = work.pPerspAttribs;
881 psContext.frontFace = work.triFlags.frontFacing;
882 psContext.primID = work.triFlags.primID;
883 psContext.pRecipW = work.pRecipW;
884 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
885 psContext.I = work.I;
886 psContext.J = work.J;
887 psContext.recipDet = work.recipDet;
888 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
889 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
890 psContext.sampleIndex = 0;
891
892 PixelRateZTestLoop<T> PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
893
894 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
895 {
896 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
897 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
898 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
899 {
900 simdscalar activeLanes;
901 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
902 activeLanes = vMask(work.anyCoveredSamples & MASK);
903
904 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
905 // set pixel center positions
906 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
907
908 RDTSC_START(BEBarycentric);
909 CalcPixelBarycentrics(coeffs, psContext);
910 RDTSC_STOP(BEBarycentric, 0, 0);
911
912 if (T::bInputCoverage)
913 {
914 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
915 }
916
917 if(T::bCentroidPos)
918 {
919 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
920 RDTSC_START(BEBarycentric);
921 if(T::bIsStandardPattern)
922 {
923 CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
924 }
925 else
926 {
927 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
928 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
929 }
930
931 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
932 RDTSC_STOP(BEBarycentric, 0, 0);
933 }
934 else
935 {
936 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
937 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
938 }
939
940 if(T::bForcedSampleCount)
941 {
942 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
943 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
944 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
945 }
946
947 // Early-Z?
948 if(T::bCanEarlyZ && !T::bForcedSampleCount)
949 {
950 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
951 UPDATE_STAT(DepthPassCount, depthPassCount);
952 }
953
954 // if we have no covered samples that passed depth at this point, go to next tile
955 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
956
957 if(pPSState->usesSourceDepth)
958 {
959 RDTSC_START(BEBarycentric);
960 // interpolate and quantize z
961 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
962 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
963 RDTSC_STOP(BEBarycentric, 0, 0);
964 }
965
966 // pixels that are currently active
967 psContext.activeMask = _simd_castps_si(activeLanes);
968 psContext.oMask = T::MultisampleT::FullSampleMask();
969
970 // execute pixel shader
971 RDTSC_START(BEPixelShader);
972 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
973 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
974 RDTSC_STOP(BEPixelShader, 0, 0);
975
976 // update active lanes to remove any discarded or oMask'd pixels
977 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
978 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
979
980 // late-Z
981 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
982 {
983 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
984 UPDATE_STAT(DepthPassCount, depthPassCount);
985 }
986
987 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
988 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
989
990 // output merger
991 // loop over all samples, broadcasting the results of the PS to all passing pixels
992 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
993 {
994 RDTSC_START(BEOutputMerger);
995 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
996 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
997 simdscalar coverageMask, depthMask;
998 if(T::bForcedSampleCount)
999 {
1000 coverageMask = depthMask = activeLanes;
1001 }
1002 else
1003 {
1004 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
1005 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
1006 if(!_simd_movemask_ps(depthMask))
1007 {
1008 // stencil should already have been written in early/lateZ tests
1009 RDTSC_STOP(BEOutputMerger, 0, 0);
1010 continue;
1011 }
1012 }
1013
1014 // broadcast the results of the PS to all passing pixels
1015 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
1016
1017 if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
1018 {
1019 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
1020 uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
1021
1022 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1023 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1024 }
1025 RDTSC_STOP(BEOutputMerger, 0, 0);
1026 }
1027 Endtile:
1028 RDTSC_START(BEEndTile);
1029 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1030 {
1031 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1032 }
1033
1034 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1035 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1036 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1037
1038 for(uint32_t rt = 0; rt < NumRT; ++rt)
1039 {
1040 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1041 }
1042 RDTSC_STOP(BEEndTile, 0, 0);
1043 }
1044 }
1045 RDTSC_STOP(BEPixelRateBackend, 0, 0);
1046 }
1047 // optimized backend flow with NULL PS
1048 template<uint32_t sampleCountT>
1049 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1050 {
1051 RDTSC_START(BENullBackend);
1052 ///@todo: handle center multisample pattern
1053 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1054 RDTSC_START(BESetup);
1055
1056 SWR_CONTEXT *pContext = pDC->pContext;
1057 const API_STATE& state = GetApiState(pDC);
1058 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1059
1060 // broadcast scalars
1061 BarycentricCoeffs coeffs;
1062 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1063 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1064 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1065
1066 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1067 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1068 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1069
1070 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1071 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1072 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1073
1074 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1075
1076 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1077
1078 RDTSC_STOP(BESetup, 0, 0);
1079
1080 SWR_PS_CONTEXT psContext;
1081 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1082 {
1083 // UL pixel corner
1084 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1085
1086 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1087 {
1088 // UL pixel corners
1089 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1090
1091 // iterate over active samples
1092 unsigned long sample = 0;
1093 uint32_t sampleMask = state.blendState.sampleMask;
1094 while (_BitScanForward(&sample, sampleMask))
1095 {
1096 sampleMask &= ~(1 << sample);
1097 simdmask coverageMask = work.coverageMask[sample] & MASK;
1098 if (coverageMask)
1099 {
1100 RDTSC_START(BEBarycentric);
1101 // calculate per sample positions
1102 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1103 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1104
1105 CalcSampleBarycentrics(coeffs, psContext);
1106
1107 // interpolate and quantize z
1108 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1109 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1110
1111 RDTSC_STOP(BEBarycentric, 0, 0);
1112
1113 // interpolate user clip distance if available
1114 if (rastState.clipDistanceMask)
1115 {
1116 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1117 psContext.vI.sample, psContext.vJ.sample);
1118 }
1119
1120 simdscalar vCoverageMask = vMask(coverageMask);
1121 simdscalar stencilPassMask = vCoverageMask;
1122
1123 // offset depth/stencil buffers current sample
1124 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
1125 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
1126
1127 RDTSC_START(BEEarlyDepthTest);
1128 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1129 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1130 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1131 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1132 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1133
1134 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1135 uint32_t statCount = _mm_popcnt_u32(statMask);
1136 UPDATE_STAT(DepthPassCount, statCount);
1137 }
1138 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1139 }
1140 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1141 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1142 }
1143 }
1144 RDTSC_STOP(BENullBackend, 0, 0);
1145 }
1146
1147 void InitClearTilesTable()
1148 {
1149 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1150
1151 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1152 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1153 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1154 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1155 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1156 }
1157
1158 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
1159 PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage
1160 [2] // centroid
1161 [2] // canEarlyZ
1162 = {};
1163 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX]
1164 [SWR_MSAA_SAMPLE_PATTERN_MAX]
1165 [2] // input coverage
1166 [2] // centroid
1167 [2] // forcedSampleCount
1168 [2] // canEarlyZ
1169 = {};
1170 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX]
1171 [2] // input coverage
1172 [2] // centroid
1173 [2] // canEarlyZ
1174 = {};
1175
1176 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1177 // arguments to static template arguments.
1178 template <uint32_t... ArgsT>
1179 struct BEChooser
1180 {
1181 // Last Arg Terminator
1182 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1183 {
1184 switch(tArg)
1185 {
1186 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1187 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1188 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1189 default:
1190 SWR_ASSERT(0 && "Invalid backend func\n");
1191 return nullptr;
1192 break;
1193 }
1194 }
1195
1196 // Recursively parse args
1197 template <typename... TArgsT>
1198 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1199 {
1200 switch(tArg)
1201 {
1202 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1203 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1204 default:
1205 SWR_ASSERT(0 && "Invalid sample pattern\n");
1206 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1207 break;
1208 }
1209 }
1210
1211 // Recursively parse args
1212 template <typename... TArgsT>
1213 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1214 {
1215 switch(tArg)
1216 {
1217 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1218 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1219 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1220 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1221 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1222 default:
1223 SWR_ASSERT(0 && "Invalid sample count\n");
1224 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1225 break;
1226 }
1227 }
1228
1229 // Recursively parse args
1230 template <typename... TArgsT>
1231 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1232 {
1233 if(tArg == true)
1234 {
1235 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1236 }
1237
1238 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1239 }
1240 };
1241
1242 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2])
1243 {
1244 for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
1245 {
1246 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1247 {
1248 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1249 {
1250 table[inputCoverage][isCentroid][canEarlyZ] =
1251 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0),
1252 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1253 }
1254 }
1255 }
1256 }
1257
1258 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][2][2][2][2])
1259 {
1260 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
1261 {
1262 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_MAX; samplePattern++)
1263 {
1264 for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
1265 {
1266 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1267 {
1268 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1269 {
1270 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1271 {
1272 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1273 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage > 0),
1274 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1275 }
1276 }
1277 }
1278 }
1279 }
1280 }
1281 }
1282
1283 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][2][2][2])
1284 {
1285 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
1286 {
1287 for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
1288 {
1289 for(uint32_t centroid = 0; centroid < 2; centroid++)
1290 {
1291 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1292 {
1293 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1294 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0),
1295 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1296 }
1297 }
1298 }
1299 }
1300 }
1301
1302 void InitBackendFuncTables()
1303 {
1304 InitBackendSingleFuncTable(gBackendSingleSample);
1305 InitBackendPixelFuncTable(gBackendPixelRateTable);
1306 InitBackendSampleFuncTable(gBackendSampleRateTable);
1307
1308 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1309 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1310 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1311 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1312 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1313 }