b1e6c91871500d76775b16fae5e4afb9708c3947
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 RDTSC_START(BEDispatch);
51
52 SWR_CONTEXT *pContext = pDC->pContext;
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->pScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
77
78 RDTSC_STOP(BEDispatch, 1, 0);
79 }
80
81 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
82 {
83 uint32_t x, y;
84 MacroTileMgr::getTileIndices(macroTile, x, y);
85 SWR_ASSERT(x == 0 && y == 0);
86 }
87
88 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
89 {
90 QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData;
91 SWR_STATS* pStats = pQueryDesc->pStats;
92 SWR_CONTEXT *pContext = pDC->pContext;
93
94 SWR_ASSERT(pStats != nullptr);
95
96 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
97 {
98 pStats->DepthPassCount += pContext->stats[i].DepthPassCount;
99
100 pStats->IaVertices += pContext->stats[i].IaVertices;
101 pStats->IaPrimitives += pContext->stats[i].IaPrimitives;
102 pStats->VsInvocations += pContext->stats[i].VsInvocations;
103 pStats->HsInvocations += pContext->stats[i].HsInvocations;
104 pStats->DsInvocations += pContext->stats[i].DsInvocations;
105 pStats->GsInvocations += pContext->stats[i].GsInvocations;
106 pStats->PsInvocations += pContext->stats[i].PsInvocations;
107 pStats->CInvocations += pContext->stats[i].CInvocations;
108 pStats->CsInvocations += pContext->stats[i].CsInvocations;
109 pStats->CPrimitives += pContext->stats[i].CPrimitives;
110 pStats->GsPrimitives += pContext->stats[i].GsPrimitives;
111
112 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
113 {
114 pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream];
115
116 /// @note client is required to provide valid write offset before every draw, so we clear
117 /// out the contents of the write offset when storing stats
118 pContext->stats[i].SoWriteOffset[stream] = 0;
119
120 pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream];
121 pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream];
122 }
123 }
124 }
125
126 template<SWR_FORMAT format>
127 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
128 {
129 auto lambda = [&](int comp)
130 {
131 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
132 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
133 };
134
135 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
136 for (uint32_t i = 0; i < numIter; ++i)
137 {
138 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
139 }
140 }
141
142 template<SWR_FORMAT format>
143 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4])
144 {
145 // convert clear color to hottile format
146 // clear color is in RGBA float/uint32
147 simdvector vClear;
148 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
149 {
150 simdscalar vComp;
151 vComp = _simd_load1_ps((const float*)&clear[comp]);
152 if (FormatTraits<format>::isNormalized(comp))
153 {
154 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
155 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
156 }
157 vComp = FormatTraits<format>::pack(comp, vComp);
158 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
159 }
160
161 uint32_t tileX, tileY;
162 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
163 const API_STATE& state = GetApiState(pDC);
164
165 int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY;
166 int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1;
167 int left = KNOB_MACROTILE_X_DIM_FIXED * tileX;
168 int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1;
169
170 // intersect with scissor
171 top = std::max(top, state.scissorInFixedPoint.top);
172 left = std::max(left, state.scissorInFixedPoint.left);
173 bottom = std::min(bottom, state.scissorInFixedPoint.bottom);
174 right = std::min(right, state.scissorInFixedPoint.right);
175
176 // translate to local hottile origin
177 top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
178 bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
179 left -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
180 right -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
181
182 // convert to raster tiles
183 top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
184 bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
185 left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
186 right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
187
188 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
189 // compute steps between raster tile samples / raster tiles / macro tile rows
190 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
191 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
192 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
193 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
194
195 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
196 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples;
197 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
198
199 // loop over all raster tiles in the current hot tile
200 for (int y = top; y <= bottom; ++y)
201 {
202 uint8_t* pRasterTile = pRasterTileRow;
203 for (int x = left; x <= right; ++x)
204 {
205 for( int sampleNum = 0; sampleNum < numSamples; sampleNum++)
206 {
207 ClearRasterTile<format>(pRasterTile, vClear);
208 pRasterTile += rasterTileSampleStep;
209 }
210 }
211 pRasterTileRow += macroTileRowStep;
212 }
213
214 pHotTile->state = HOTTILE_DIRTY;
215 }
216
217
218 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
219 {
220 if (KNOB_FAST_CLEAR)
221 {
222 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
223 SWR_CONTEXT *pContext = pDC->pContext;
224 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
225 uint32_t numSamples = GetNumSamples(sampleCount);
226
227 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
228
229 RDTSC_START(BEClear);
230
231 if (pClear->flags.mask & SWR_CLEAR_COLOR)
232 {
233 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
234 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
235 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
236 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
237 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
238 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
239 pHotTile->state = HOTTILE_CLEAR;
240 }
241
242 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
243 {
244 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
245 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
246 pHotTile->state = HOTTILE_CLEAR;
247 }
248
249 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
250 {
251 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
252
253 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
254 pHotTile->state = HOTTILE_CLEAR;
255 }
256
257 RDTSC_STOP(BEClear, 0, 0);
258 }
259 else
260 {
261 // Legacy clear
262 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
263 RDTSC_START(BEClear);
264
265 if (pClear->flags.mask & SWR_CLEAR_COLOR)
266 {
267 /// @todo clear data should come in as RGBA32_FLOAT
268 DWORD clearData[4];
269 float clearFloat[4];
270 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
271 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
272 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
273 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
274 clearData[0] = *(DWORD*)&clearFloat[0];
275 clearData[1] = *(DWORD*)&clearFloat[1];
276 clearData[2] = *(DWORD*)&clearFloat[2];
277 clearData[3] = *(DWORD*)&clearFloat[3];
278
279 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
280 SWR_ASSERT(pfnClearTiles != nullptr);
281
282 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData);
283 }
284
285 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
286 {
287 DWORD clearData[4];
288 clearData[0] = *(DWORD*)&pClear->clearDepth;
289 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
290 SWR_ASSERT(pfnClearTiles != nullptr);
291
292 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData);
293 }
294
295 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
296 {
297 uint32_t value = pClear->clearStencil;
298 DWORD clearData[4];
299 clearData[0] = *(DWORD*)&value;
300 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
301
302 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData);
303 }
304
305 RDTSC_STOP(BEClear, 0, 0);
306 }
307 }
308
309
310 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
311 {
312 RDTSC_START(BEStoreTiles);
313 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
314 SWR_CONTEXT *pContext = pDC->pContext;
315
316 #ifdef KNOB_ENABLE_RDTSC
317 uint32_t numTiles = 0;
318 #endif
319 SWR_FORMAT srcFormat;
320 switch (pDesc->attachment)
321 {
322 case SWR_ATTACHMENT_COLOR0:
323 case SWR_ATTACHMENT_COLOR1:
324 case SWR_ATTACHMENT_COLOR2:
325 case SWR_ATTACHMENT_COLOR3:
326 case SWR_ATTACHMENT_COLOR4:
327 case SWR_ATTACHMENT_COLOR5:
328 case SWR_ATTACHMENT_COLOR6:
329 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
330 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
331 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
332 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
333 }
334
335 uint32_t x, y;
336 MacroTileMgr::getTileIndices(macroTile, x, y);
337
338 // Only need to store the hottile if it's been rendered to...
339 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
340 if (pHotTile)
341 {
342 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
343 if (pHotTile->state == HOTTILE_CLEAR)
344 {
345 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
346 SWR_ASSERT(pfnClearTiles != nullptr);
347
348 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData);
349 }
350
351 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
352 {
353 int destX = KNOB_MACROTILE_X_DIM * x;
354 int destY = KNOB_MACROTILE_Y_DIM * y;
355
356 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
357 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
358 }
359
360
361 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
362 {
363 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
364 }
365 }
366 RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
367 }
368
369
370 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
371 {
372 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
373 SWR_CONTEXT *pContext = pDC->pContext;
374
375 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
376
377 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
378 {
379 if (pDesc->attachmentMask & (1 << i))
380 {
381 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
382 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
383 if (pHotTile)
384 {
385 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
386 }
387 }
388 }
389 }
390
391 #if KNOB_SIMD_WIDTH == 8
392 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
393 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
394 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
395 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
396 #else
397 #error Unsupported vector width
398 #endif
399
400 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
401 {
402 simdscalar vClipMask = _simd_setzero_ps();
403 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
404
405 for (uint32_t i = 0; i < numClipDistance; ++i)
406 {
407 // pull triangle clip distance values from clip buffer
408 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
409 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
410 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
411
412 // interpolate
413 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
414
415 // clip if interpolated clip distance is < 0 || NAN
416 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
417
418 vClipMask = _simd_or_ps(vClipMask, vCull);
419 }
420
421 return _simd_movemask_ps(vClipMask);
422 }
423
424 template<typename T>
425 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
426 {
427 RDTSC_START(BESingleSampleBackend);
428 RDTSC_START(BESetup);
429
430 SWR_CONTEXT *pContext = pDC->pContext;
431 const API_STATE& state = GetApiState(pDC);
432 const SWR_RASTSTATE& rastState = state.rastState;
433 const SWR_PS_STATE *pPSState = &state.psState;
434 const SWR_BLEND_STATE *pBlendState = &state.blendState;
435 uint64_t coverageMask = work.coverageMask[0];
436
437 // broadcast scalars
438 BarycentricCoeffs coeffs;
439 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
440 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
441 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
442
443 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
444 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
445 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
446
447 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
448 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
449 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
450
451 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
452
453 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
454 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
455 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
456
457 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
458 uint32_t NumRT = state.psState.numRenderTargets;
459 for(uint32_t rt = 0; rt < NumRT; ++rt)
460 {
461 pColorBase[rt] = renderBuffers.pColor[rt];
462 }
463 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
464 RDTSC_STOP(BESetup, 0, 0);
465
466 SWR_PS_CONTEXT psContext;
467 psContext.pAttribs = work.pAttribs;
468 psContext.pPerspAttribs = work.pPerspAttribs;
469 psContext.frontFace = work.triFlags.frontFacing;
470 psContext.primID = work.triFlags.primID;
471
472 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
473 psContext.I = work.I;
474 psContext.J = work.J;
475 psContext.recipDet = work.recipDet;
476 psContext.pRecipW = work.pRecipW;
477 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
478 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
479
480 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
481 {
482 // UL pixel corner
483 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
484 // pixel center
485 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
486
487 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
488 {
489 if(coverageMask & MASK)
490 {
491 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
492 // pixel center
493 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
494
495 if(T::bInputCoverage)
496 {
497 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
498 }
499
500 RDTSC_START(BEBarycentric);
501 CalcPixelBarycentrics(coeffs, psContext);
502
503 // for 1x case, centroid is pixel center
504 psContext.vX.centroid = psContext.vX.center;
505 psContext.vY.centroid = psContext.vY.center;
506 psContext.vI.centroid = psContext.vI.center;
507 psContext.vJ.centroid = psContext.vJ.center;
508 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
509
510 // interpolate and quantize z
511 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
512 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
513 RDTSC_STOP(BEBarycentric, 0, 0);
514
515 simdmask clipCoverageMask = coverageMask & MASK;
516 // interpolate user clip distance if available
517 if(rastState.clipDistanceMask)
518 {
519 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
520 psContext.vI.center, psContext.vJ.center);
521 }
522
523 simdscalar vCoverageMask = vMask(clipCoverageMask);
524 simdscalar depthPassMask = vCoverageMask;
525 simdscalar stencilPassMask = vCoverageMask;
526
527 // Early-Z?
528 if(T::bCanEarlyZ)
529 {
530 RDTSC_START(BEEarlyDepthTest);
531 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
532 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
533 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
534
535 // early-exit if no pixels passed depth or earlyZ is forced on
536 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
537 {
538 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
539 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
540
541 if (!_simd_movemask_ps(depthPassMask))
542 {
543 goto Endtile;
544 }
545 }
546 }
547
548 psContext.sampleIndex = 0;
549 psContext.activeMask = _simd_castps_si(vCoverageMask);
550
551 // execute pixel shader
552 RDTSC_START(BEPixelShader);
553 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
554 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
555 RDTSC_STOP(BEPixelShader, 0, 0);
556
557 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
558
559 // late-Z
560 if(!T::bCanEarlyZ)
561 {
562 RDTSC_START(BELateDepthTest);
563 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
564 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
565 RDTSC_STOP(BELateDepthTest, 0, 0);
566
567 if(!_simd_movemask_ps(depthPassMask))
568 {
569 // need to call depth/stencil write for stencil write
570 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
571 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
572 goto Endtile;
573 }
574 }
575
576 uint32_t statMask = _simd_movemask_ps(depthPassMask);
577 uint32_t statCount = _mm_popcnt_u32(statMask);
578 UPDATE_STAT(DepthPassCount, statCount);
579
580 // output merger
581 RDTSC_START(BEOutputMerger);
582 OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
583
584 // do final depth write after all pixel kills
585 if (!pPSState->forceEarlyZ)
586 {
587 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
588 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
589 }
590 RDTSC_STOP(BEOutputMerger, 0, 0);
591 }
592
593 Endtile:
594 RDTSC_START(BEEndTile);
595 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
596 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
597 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
598
599 for(uint32_t rt = 0; rt < NumRT; ++rt)
600 {
601 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
602 }
603 RDTSC_STOP(BEEndTile, 0, 0);
604 }
605 }
606 RDTSC_STOP(BESingleSampleBackend, 0, 0);
607 }
608
609 template<typename T>
610 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
611 {
612 RDTSC_START(BESampleRateBackend);
613 RDTSC_START(BESetup);
614
615 SWR_CONTEXT *pContext = pDC->pContext;
616 const API_STATE& state = GetApiState(pDC);
617 const SWR_RASTSTATE& rastState = state.rastState;
618 const SWR_PS_STATE *pPSState = &state.psState;
619 const SWR_BLEND_STATE *pBlendState = &state.blendState;
620
621 // broadcast scalars
622 BarycentricCoeffs coeffs;
623 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
624 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
625 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
626
627 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
628 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
629 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
630
631 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
632 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
633 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
634
635 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
636
637 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
638 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
639 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
640
641 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
642 uint32_t NumRT = state.psState.numRenderTargets;
643 for(uint32_t rt = 0; rt < NumRT; ++rt)
644 {
645 pColorBase[rt] = renderBuffers.pColor[rt];
646 }
647 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
648 RDTSC_STOP(BESetup, 0, 0);
649
650 SWR_PS_CONTEXT psContext;
651 psContext.pAttribs = work.pAttribs;
652 psContext.pPerspAttribs = work.pPerspAttribs;
653 psContext.pRecipW = work.pRecipW;
654 psContext.frontFace = work.triFlags.frontFacing;
655 psContext.primID = work.triFlags.primID;
656
657 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
658 psContext.I = work.I;
659 psContext.J = work.J;
660 psContext.recipDet = work.recipDet;
661 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
662 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
663
664 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
665 {
666 // UL pixel corner
667 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
668 // pixel center
669 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
670
671 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
672 {
673 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
674 // pixel center
675 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
676
677 RDTSC_START(BEBarycentric);
678 CalcPixelBarycentrics(coeffs, psContext);
679 RDTSC_STOP(BEBarycentric, 0, 0);
680
681 if(T::bInputCoverage)
682 {
683 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
684 }
685
686 if(T::bCentroidPos)
687 {
688 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
689 RDTSC_START(BEBarycentric);
690 if(T::bIsStandardPattern)
691 {
692 CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
693 }
694 else
695 {
696 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
697 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
698 }
699 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
700 RDTSC_STOP(BEBarycentric, 0, 0);
701 }
702 else
703 {
704 psContext.vX.centroid = psContext.vX.sample;
705 psContext.vY.centroid = psContext.vY.sample;
706 }
707
708 for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
709 {
710 simdmask coverageMask = work.coverageMask[sample] & MASK;
711 if (coverageMask)
712 {
713 RDTSC_START(BEBarycentric);
714 // calculate per sample positions
715 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
716 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
717
718 CalcSampleBarycentrics(coeffs, psContext);
719
720 // interpolate and quantize z
721 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
722 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
723 RDTSC_STOP(BEBarycentric, 0, 0);
724
725 // interpolate user clip distance if available
726 if (rastState.clipDistanceMask)
727 {
728 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
729 psContext.vI.sample, psContext.vJ.sample);
730 }
731
732 simdscalar vCoverageMask = vMask(coverageMask);
733 simdscalar depthPassMask = vCoverageMask;
734 simdscalar stencilPassMask = vCoverageMask;
735
736 // offset depth/stencil buffers current sample
737 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
738 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
739
740 // Early-Z?
741 if (T::bCanEarlyZ)
742 {
743 RDTSC_START(BEEarlyDepthTest);
744 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
745 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
746 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
747
748 // early-exit if no samples passed depth or earlyZ is forced on.
749 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
750 {
751 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
752 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
753
754 if (!_simd_movemask_ps(depthPassMask))
755 {
756 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
757 continue;
758 }
759 }
760 }
761
762 psContext.sampleIndex = sample;
763 psContext.activeMask = _simd_castps_si(vCoverageMask);
764
765 // execute pixel shader
766 RDTSC_START(BEPixelShader);
767 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
768 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
769 RDTSC_STOP(BEPixelShader, 0, 0);
770
771 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
772
773 // late-Z
774 if (!T::bCanEarlyZ)
775 {
776 RDTSC_START(BELateDepthTest);
777 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
778 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
779 RDTSC_STOP(BELateDepthTest, 0, 0);
780
781 if (!_simd_movemask_ps(depthPassMask))
782 {
783 // need to call depth/stencil write for stencil write
784 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
785 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
786
787 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
788 continue;
789 }
790 }
791
792 uint32_t statMask = _simd_movemask_ps(depthPassMask);
793 uint32_t statCount = _mm_popcnt_u32(statMask);
794 UPDATE_STAT(DepthPassCount, statCount);
795
796 // output merger
797 RDTSC_START(BEOutputMerger);
798 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
799
800 // do final depth write after all pixel kills
801 if (!pPSState->forceEarlyZ)
802 {
803 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
804 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
805 }
806 RDTSC_STOP(BEOutputMerger, 0, 0);
807 }
808 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
809 }
810 RDTSC_START(BEEndTile);
811 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
812 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
813
814 for (uint32_t rt = 0; rt < NumRT; ++rt)
815 {
816 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
817 }
818 RDTSC_STOP(BEEndTile, 0, 0);
819 }
820 }
821 RDTSC_STOP(BESampleRateBackend, 0, 0);
822 }
823
824 template<typename T>
825 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
826 {
827 RDTSC_START(BEPixelRateBackend);
828 RDTSC_START(BESetup);
829
830 SWR_CONTEXT *pContext = pDC->pContext;
831 const API_STATE& state = GetApiState(pDC);
832 const SWR_RASTSTATE& rastState = state.rastState;
833 const SWR_PS_STATE *pPSState = &state.psState;
834 const SWR_BLEND_STATE *pBlendState = &state.blendState;
835
836 // broadcast scalars
837 BarycentricCoeffs coeffs;
838 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
839 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
840 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
841
842 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
843 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
844 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
845
846 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
847 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
848 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
849
850 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
851
852 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
853 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
854 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
855
856 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
857 uint32_t NumRT = state.psState.numRenderTargets;
858 for(uint32_t rt = 0; rt < NumRT; ++rt)
859 {
860 pColorBase[rt] = renderBuffers.pColor[rt];
861 }
862 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
863 RDTSC_STOP(BESetup, 0, 0);
864
865 SWR_PS_CONTEXT psContext;
866 psContext.pAttribs = work.pAttribs;
867 psContext.pPerspAttribs = work.pPerspAttribs;
868 psContext.frontFace = work.triFlags.frontFacing;
869 psContext.primID = work.triFlags.primID;
870 psContext.pRecipW = work.pRecipW;
871 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
872 psContext.I = work.I;
873 psContext.J = work.J;
874 psContext.recipDet = work.recipDet;
875 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
876 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
877 psContext.sampleIndex = 0;
878
879 PixelRateZTestLoop<T> PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
880
881 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
882 {
883 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
884 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
885 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
886 {
887 simdscalar activeLanes;
888 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
889 activeLanes = vMask(work.anyCoveredSamples & MASK);
890
891 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
892 // set pixel center positions
893 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
894
895 RDTSC_START(BEBarycentric);
896 CalcPixelBarycentrics(coeffs, psContext);
897 RDTSC_STOP(BEBarycentric, 0, 0);
898
899 if (T::bInputCoverage)
900 {
901 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
902 }
903
904 if(T::bCentroidPos)
905 {
906 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
907 RDTSC_START(BEBarycentric);
908 if(T::bIsStandardPattern)
909 {
910 CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
911 }
912 else
913 {
914 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
915 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
916 }
917
918 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
919 RDTSC_STOP(BEBarycentric, 0, 0);
920 }
921 else
922 {
923 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
924 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
925 }
926
927 if(T::bForcedSampleCount)
928 {
929 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
930 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
931 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
932 }
933
934 // Early-Z?
935 if(T::bCanEarlyZ && !T::bForcedSampleCount)
936 {
937 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
938 UPDATE_STAT(DepthPassCount, depthPassCount);
939 }
940
941 // if we have no covered samples that passed depth at this point, go to next tile
942 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
943
944 if(pPSState->usesSourceDepth)
945 {
946 RDTSC_START(BEBarycentric);
947 // interpolate and quantize z
948 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
949 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
950 RDTSC_STOP(BEBarycentric, 0, 0);
951 }
952
953 // pixels that are currently active
954 psContext.activeMask = _simd_castps_si(activeLanes);
955 psContext.oMask = T::MultisampleT::FullSampleMask();
956
957 // execute pixel shader
958 RDTSC_START(BEPixelShader);
959 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
960 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
961 RDTSC_STOP(BEPixelShader, 0, 0);
962
963 // update active lanes to remove any discarded or oMask'd pixels
964 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
965 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
966
967 // late-Z
968 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
969 {
970 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
971 UPDATE_STAT(DepthPassCount, depthPassCount);
972 }
973
974 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
975 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
976
977 // output merger
978 // loop over all samples, broadcasting the results of the PS to all passing pixels
979 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
980 {
981 RDTSC_START(BEOutputMerger);
982 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
983 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
984 simdscalar coverageMask, depthMask;
985 if(T::bForcedSampleCount)
986 {
987 coverageMask = depthMask = activeLanes;
988 }
989 else
990 {
991 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
992 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
993 if(!_simd_movemask_ps(depthMask))
994 {
995 // stencil should already have been written in early/lateZ tests
996 RDTSC_STOP(BEOutputMerger, 0, 0);
997 continue;
998 }
999 }
1000
1001 // broadcast the results of the PS to all passing pixels
1002 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
1003
1004 if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
1005 {
1006 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
1007 uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
1008
1009 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1010 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1011 }
1012 RDTSC_STOP(BEOutputMerger, 0, 0);
1013 }
1014 Endtile:
1015 RDTSC_START(BEEndTile);
1016 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1017 {
1018 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1019 }
1020
1021 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1022 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1023 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1024
1025 for(uint32_t rt = 0; rt < NumRT; ++rt)
1026 {
1027 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1028 }
1029 RDTSC_STOP(BEEndTile, 0, 0);
1030 }
1031 }
1032 RDTSC_STOP(BEPixelRateBackend, 0, 0);
1033 }
1034 // optimized backend flow with NULL PS
1035 template<uint32_t sampleCountT>
1036 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1037 {
1038 RDTSC_START(BENullBackend);
1039 ///@todo: handle center multisample pattern
1040 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1041 RDTSC_START(BESetup);
1042
1043 SWR_CONTEXT *pContext = pDC->pContext;
1044 const API_STATE& state = GetApiState(pDC);
1045 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1046
1047 // broadcast scalars
1048 BarycentricCoeffs coeffs;
1049 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1050 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1051 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1052
1053 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1054 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1055 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1056
1057 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1058 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1059 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1060
1061 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1062
1063 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1064
1065 RDTSC_STOP(BESetup, 0, 0);
1066
1067 SWR_PS_CONTEXT psContext;
1068 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1069 {
1070 // UL pixel corner
1071 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1072
1073 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1074 {
1075 // UL pixel corners
1076 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1077
1078 // iterate over active samples
1079 unsigned long sample = 0;
1080 uint32_t sampleMask = state.blendState.sampleMask;
1081 while (_BitScanForward(&sample, sampleMask))
1082 {
1083 sampleMask &= ~(1 << sample);
1084 simdmask coverageMask = work.coverageMask[sample] & MASK;
1085 if (coverageMask)
1086 {
1087 RDTSC_START(BEBarycentric);
1088 // calculate per sample positions
1089 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1090 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1091
1092 CalcSampleBarycentrics(coeffs, psContext);
1093
1094 // interpolate and quantize z
1095 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1096 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1097
1098 RDTSC_STOP(BEBarycentric, 0, 0);
1099
1100 // interpolate user clip distance if available
1101 if (rastState.clipDistanceMask)
1102 {
1103 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1104 psContext.vI.sample, psContext.vJ.sample);
1105 }
1106
1107 simdscalar vCoverageMask = vMask(coverageMask);
1108 simdscalar stencilPassMask = vCoverageMask;
1109
1110 // offset depth/stencil buffers current sample
1111 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
1112 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
1113
1114 RDTSC_START(BEEarlyDepthTest);
1115 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1116 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1117 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1118 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1119 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1120
1121 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1122 uint32_t statCount = _mm_popcnt_u32(statMask);
1123 UPDATE_STAT(DepthPassCount, statCount);
1124 }
1125 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1126 }
1127 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1128 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1129 }
1130 }
1131 RDTSC_STOP(BENullBackend, 0, 0);
1132 }
1133
1134 void InitClearTilesTable()
1135 {
1136 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1137
1138 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1139 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1140 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1141 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1142 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1143 }
1144
1145 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
1146 PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage
1147 [2] // centroid
1148 [2] // canEarlyZ
1149 = {};
1150 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1151 [SWR_MSAA_SAMPLE_PATTERN_COUNT]
1152 [2] // input coverage
1153 [2] // centroid
1154 [2] // forcedSampleCount
1155 [2] // canEarlyZ
1156 = {};
1157 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1158 [2] // input coverage
1159 [2] // centroid
1160 [2] // canEarlyZ
1161 = {};
1162
1163 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1164 // arguments to static template arguments.
1165 template <uint32_t... ArgsT>
1166 struct BEChooser
1167 {
1168 // Last Arg Terminator
1169 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1170 {
1171 switch(tArg)
1172 {
1173 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1174 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1175 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1176 default:
1177 SWR_ASSERT(0 && "Invalid backend func\n");
1178 return nullptr;
1179 break;
1180 }
1181 }
1182
1183 // Recursively parse args
1184 template <typename... TArgsT>
1185 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1186 {
1187 switch(tArg)
1188 {
1189 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1190 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1191 default:
1192 SWR_ASSERT(0 && "Invalid sample pattern\n");
1193 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1194 break;
1195 }
1196 }
1197
1198 // Recursively parse args
1199 template <typename... TArgsT>
1200 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1201 {
1202 switch(tArg)
1203 {
1204 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1205 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1206 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1207 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1208 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1209 default:
1210 SWR_ASSERT(0 && "Invalid sample count\n");
1211 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1212 break;
1213 }
1214 }
1215
1216 // Recursively parse args
1217 template <typename... TArgsT>
1218 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1219 {
1220 if(tArg == true)
1221 {
1222 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1223 }
1224
1225 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1226 }
1227 };
1228
1229 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2])
1230 {
1231 for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
1232 {
1233 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1234 {
1235 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1236 {
1237 table[inputCoverage][isCentroid][canEarlyZ] =
1238 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0),
1239 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1240 }
1241 }
1242 }
1243 }
1244
1245 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][2][2][2][2])
1246 {
1247 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1248 {
1249 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
1250 {
1251 for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
1252 {
1253 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1254 {
1255 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1256 {
1257 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1258 {
1259 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1260 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage > 0),
1261 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1262 }
1263 }
1264 }
1265 }
1266 }
1267 }
1268 }
1269
1270 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][2][2][2])
1271 {
1272 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1273 {
1274 for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
1275 {
1276 for(uint32_t centroid = 0; centroid < 2; centroid++)
1277 {
1278 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1279 {
1280 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1281 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0),
1282 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1283 }
1284 }
1285 }
1286 }
1287 }
1288
1289 void InitBackendFuncTables()
1290 {
1291 InitBackendSingleFuncTable(gBackendSingleSample);
1292 InitBackendPixelFuncTable(gBackendPixelRateTable);
1293 InitBackendSampleFuncTable(gBackendSampleRateTable);
1294
1295 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1296 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1297 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1298 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1299 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1300 }