7dd6c0db3de2d0d16b95c5b55206607565312d99
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "depthstencil.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4], const SWR_RECT& rect);
41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
49 {
50 RDTSC_START(BEDispatch);
51
52 SWR_CONTEXT *pContext = pDC->pContext;
53
54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
55 SWR_ASSERT(pTaskData != nullptr);
56
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
59 if (spillFillSize && pSpillFillBuffer == nullptr)
60 {
61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
62 }
63
64 const API_STATE& state = GetApiState(pDC);
65
66 SWR_CS_CONTEXT csContext{ 0 };
67 csContext.tileCounter = threadGroupId;
68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
71 csContext.pTGSM = pContext->pScratch[workerId];
72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
73
74 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
75
76 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
77
78 RDTSC_STOP(BEDispatch, 1, 0);
79 }
80
81 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
82 {
83 uint32_t x, y;
84 MacroTileMgr::getTileIndices(macroTile, x, y);
85 SWR_ASSERT(x == 0 && y == 0);
86 }
87
88 template<SWR_FORMAT format>
89 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
90 {
91 auto lambda = [&](int32_t comp)
92 {
93 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
94 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
95 };
96
97 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
98 for (uint32_t i = 0; i < numIter; ++i)
99 {
100 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
101 }
102 }
103
104 template<SWR_FORMAT format>
105 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4], const SWR_RECT& rect)
106 {
107 // convert clear color to hottile format
108 // clear color is in RGBA float/uint32
109 simdvector vClear;
110 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
111 {
112 simdscalar vComp;
113 vComp = _simd_load1_ps((const float*)&clear[comp]);
114 if (FormatTraits<format>::isNormalized(comp))
115 {
116 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
117 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
118 }
119 vComp = FormatTraits<format>::pack(comp, vComp);
120 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
121 }
122
123 uint32_t tileX, tileY;
124 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
125
126 // Init to full macrotile
127 SWR_RECT clearTile =
128 {
129 KNOB_MACROTILE_X_DIM * int32_t(tileX),
130 KNOB_MACROTILE_Y_DIM * int32_t(tileY),
131 KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
132 KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
133 };
134
135 // intersect with clear rect
136 clearTile &= rect;
137
138 // translate to local hottile origin
139 clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
140
141 // Make maximums inclusive (needed for convert to raster tiles)
142 clearTile.xmax -= 1;
143 clearTile.ymax -= 1;
144
145 // convert to raster tiles
146 clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
147 clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
148 clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
149 clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
150
151 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
152 // compute steps between raster tile samples / raster tiles / macro tile rows
153 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
154 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
155 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
156 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
157
158 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
159 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
160 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
161
162 // loop over all raster tiles in the current hot tile
163 for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
164 {
165 uint8_t* pRasterTile = pRasterTileRow;
166 for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
167 {
168 for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
169 {
170 ClearRasterTile<format>(pRasterTile, vClear);
171 pRasterTile += rasterTileSampleStep;
172 }
173 }
174 pRasterTileRow += macroTileRowStep;
175 }
176
177 pHotTile->state = HOTTILE_DIRTY;
178 }
179
180
181 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
182 {
183 if (KNOB_FAST_CLEAR)
184 {
185 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
186 SWR_CONTEXT *pContext = pDC->pContext;
187 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
188 uint32_t numSamples = GetNumSamples(sampleCount);
189
190 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
191
192 RDTSC_START(BEClear);
193
194 if (pClear->flags.mask & SWR_CLEAR_COLOR)
195 {
196 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
197 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
198 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
199 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
200 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
201 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
202 pHotTile->state = HOTTILE_CLEAR;
203 }
204
205 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
206 {
207 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
208 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
209 pHotTile->state = HOTTILE_CLEAR;
210 }
211
212 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
213 {
214 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
215
216 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
217 pHotTile->state = HOTTILE_CLEAR;
218 }
219
220 RDTSC_STOP(BEClear, 0, 0);
221 }
222 else
223 {
224 // Legacy clear
225 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
226 RDTSC_START(BEClear);
227
228 if (pClear->flags.mask & SWR_CLEAR_COLOR)
229 {
230 /// @todo clear data should come in as RGBA32_FLOAT
231 DWORD clearData[4];
232 float clearFloat[4];
233 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
234 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
235 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
236 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
237 clearData[0] = *(DWORD*)&clearFloat[0];
238 clearData[1] = *(DWORD*)&clearFloat[1];
239 clearData[2] = *(DWORD*)&clearFloat[2];
240 clearData[3] = *(DWORD*)&clearFloat[3];
241
242 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
243 SWR_ASSERT(pfnClearTiles != nullptr);
244
245 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData, pClear->rect);
246 }
247
248 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
249 {
250 DWORD clearData[4];
251 clearData[0] = *(DWORD*)&pClear->clearDepth;
252 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
253 SWR_ASSERT(pfnClearTiles != nullptr);
254
255 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData, pClear->rect);
256 }
257
258 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
259 {
260 uint32_t value = pClear->clearStencil;
261 DWORD clearData[4];
262 clearData[0] = *(DWORD*)&value;
263 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
264
265 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData, pClear->rect);
266 }
267
268 RDTSC_STOP(BEClear, 0, 0);
269 }
270 }
271
272
273 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
274 {
275 RDTSC_START(BEStoreTiles);
276 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
277 SWR_CONTEXT *pContext = pDC->pContext;
278
279 #ifdef KNOB_ENABLE_RDTSC
280 uint32_t numTiles = 0;
281 #endif
282 SWR_FORMAT srcFormat;
283 switch (pDesc->attachment)
284 {
285 case SWR_ATTACHMENT_COLOR0:
286 case SWR_ATTACHMENT_COLOR1:
287 case SWR_ATTACHMENT_COLOR2:
288 case SWR_ATTACHMENT_COLOR3:
289 case SWR_ATTACHMENT_COLOR4:
290 case SWR_ATTACHMENT_COLOR5:
291 case SWR_ATTACHMENT_COLOR6:
292 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
293 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
294 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
295 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
296 }
297
298 uint32_t x, y;
299 MacroTileMgr::getTileIndices(macroTile, x, y);
300
301 // Only need to store the hottile if it's been rendered to...
302 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
303 if (pHotTile)
304 {
305 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
306 if (pHotTile->state == HOTTILE_CLEAR)
307 {
308 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
309 SWR_ASSERT(pfnClearTiles != nullptr);
310
311 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData, pDesc->rect);
312 }
313
314 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
315 {
316 int32_t destX = KNOB_MACROTILE_X_DIM * x;
317 int32_t destY = KNOB_MACROTILE_Y_DIM * y;
318
319 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
320 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
321 }
322
323
324 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
325 {
326 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
327 }
328 }
329 RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
330 }
331
332
333 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
334 {
335 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
336 SWR_CONTEXT *pContext = pDC->pContext;
337
338 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
339
340 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
341 {
342 if (pDesc->attachmentMask & (1 << i))
343 {
344 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
345 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
346 if (pHotTile)
347 {
348 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
349 }
350 }
351 }
352 }
353
354 #if KNOB_SIMD_WIDTH == 8
355 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
356 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
357 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
358 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
359 #else
360 #error Unsupported vector width
361 #endif
362
363 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
364 {
365 simdscalar vClipMask = _simd_setzero_ps();
366 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
367
368 for (uint32_t i = 0; i < numClipDistance; ++i)
369 {
370 // pull triangle clip distance values from clip buffer
371 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
372 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
373 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
374
375 // interpolate
376 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
377
378 // clip if interpolated clip distance is < 0 || NAN
379 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
380
381 vClipMask = _simd_or_ps(vClipMask, vCull);
382 }
383
384 return _simd_movemask_ps(vClipMask);
385 }
386
387 template<typename T>
388 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
389 {
390 RDTSC_START(BESingleSampleBackend);
391 RDTSC_START(BESetup);
392
393 const API_STATE& state = GetApiState(pDC);
394 const SWR_RASTSTATE& rastState = state.rastState;
395 const SWR_PS_STATE *pPSState = &state.psState;
396 const SWR_BLEND_STATE *pBlendState = &state.blendState;
397 uint64_t coverageMask = work.coverageMask[0];
398
399 // broadcast scalars
400 BarycentricCoeffs coeffs;
401 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
402 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
403 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
404
405 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
406 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
407 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
408
409 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
410 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
411 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
412
413 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
414
415 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
416 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
417 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
418
419 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
420 uint32_t NumRT = state.psState.numRenderTargets;
421 for(uint32_t rt = 0; rt < NumRT; ++rt)
422 {
423 pColorBase[rt] = renderBuffers.pColor[rt];
424 }
425 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
426 RDTSC_STOP(BESetup, 0, 0);
427
428 SWR_PS_CONTEXT psContext;
429 psContext.pAttribs = work.pAttribs;
430 psContext.pPerspAttribs = work.pPerspAttribs;
431 psContext.frontFace = work.triFlags.frontFacing;
432 psContext.primID = work.triFlags.primID;
433
434 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
435 psContext.I = work.I;
436 psContext.J = work.J;
437 psContext.recipDet = work.recipDet;
438 psContext.pRecipW = work.pRecipW;
439 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
440 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
441 psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
442
443 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
444 {
445 // UL pixel corner
446 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
447 // pixel center
448 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
449
450 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
451 {
452 if(coverageMask & MASK)
453 {
454 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
455 // pixel center
456 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
457
458 if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
459 {
460 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
461 &work.coverageMask[0];
462 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
463 }
464
465 RDTSC_START(BEBarycentric);
466 CalcPixelBarycentrics(coeffs, psContext);
467
468 // for 1x case, centroid is pixel center
469 psContext.vX.centroid = psContext.vX.center;
470 psContext.vY.centroid = psContext.vY.center;
471 psContext.vI.centroid = psContext.vI.center;
472 psContext.vJ.centroid = psContext.vJ.center;
473 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
474
475 // interpolate and quantize z
476 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
477 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
478 RDTSC_STOP(BEBarycentric, 0, 0);
479
480 simdmask clipCoverageMask = coverageMask & MASK;
481 // interpolate user clip distance if available
482 if(rastState.clipDistanceMask)
483 {
484 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
485 psContext.vI.center, psContext.vJ.center);
486 }
487
488 simdscalar vCoverageMask = vMask(clipCoverageMask);
489 simdscalar depthPassMask = vCoverageMask;
490 simdscalar stencilPassMask = vCoverageMask;
491
492 // Early-Z?
493 if(T::bCanEarlyZ)
494 {
495 RDTSC_START(BEEarlyDepthTest);
496 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
497 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
498 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
499
500 // early-exit if no pixels passed depth or earlyZ is forced on
501 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
502 {
503 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
504 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
505
506 if (!_simd_movemask_ps(depthPassMask))
507 {
508 goto Endtile;
509 }
510 }
511 }
512
513 psContext.sampleIndex = 0;
514 psContext.activeMask = _simd_castps_si(vCoverageMask);
515
516 // execute pixel shader
517 RDTSC_START(BEPixelShader);
518 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
519 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
520 RDTSC_STOP(BEPixelShader, 0, 0);
521
522 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
523
524 // late-Z
525 if(!T::bCanEarlyZ)
526 {
527 RDTSC_START(BELateDepthTest);
528 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
529 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
530 RDTSC_STOP(BELateDepthTest, 0, 0);
531
532 if(!_simd_movemask_ps(depthPassMask))
533 {
534 // need to call depth/stencil write for stencil write
535 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
536 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
537 goto Endtile;
538 }
539 }
540
541 uint32_t statMask = _simd_movemask_ps(depthPassMask);
542 uint32_t statCount = _mm_popcnt_u32(statMask);
543 UPDATE_STAT(DepthPassCount, statCount);
544
545 // output merger
546 RDTSC_START(BEOutputMerger);
547 OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
548
549 // do final depth write after all pixel kills
550 if (!pPSState->forceEarlyZ)
551 {
552 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
553 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
554 }
555 RDTSC_STOP(BEOutputMerger, 0, 0);
556 }
557
558 Endtile:
559 RDTSC_START(BEEndTile);
560 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
561 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
562 {
563 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
564 }
565 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
566 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
567
568 for(uint32_t rt = 0; rt < NumRT; ++rt)
569 {
570 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
571 }
572 RDTSC_STOP(BEEndTile, 0, 0);
573 }
574 }
575 RDTSC_STOP(BESingleSampleBackend, 0, 0);
576 }
577
578 template<typename T>
579 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
580 {
581 RDTSC_START(BESampleRateBackend);
582 RDTSC_START(BESetup);
583
584 const API_STATE& state = GetApiState(pDC);
585 const SWR_RASTSTATE& rastState = state.rastState;
586 const SWR_PS_STATE *pPSState = &state.psState;
587 const SWR_BLEND_STATE *pBlendState = &state.blendState;
588
589 // broadcast scalars
590 BarycentricCoeffs coeffs;
591 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
592 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
593 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
594
595 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
596 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
597 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
598
599 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
600 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
601 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
602
603 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
604
605 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
606 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
607 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
608
609 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
610 uint32_t NumRT = state.psState.numRenderTargets;
611 for(uint32_t rt = 0; rt < NumRT; ++rt)
612 {
613 pColorBase[rt] = renderBuffers.pColor[rt];
614 }
615 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
616 RDTSC_STOP(BESetup, 0, 0);
617
618 SWR_PS_CONTEXT psContext;
619 psContext.pAttribs = work.pAttribs;
620 psContext.pPerspAttribs = work.pPerspAttribs;
621 psContext.pRecipW = work.pRecipW;
622 psContext.frontFace = work.triFlags.frontFacing;
623 psContext.primID = work.triFlags.primID;
624
625 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
626 psContext.I = work.I;
627 psContext.J = work.J;
628 psContext.recipDet = work.recipDet;
629 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
630 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
631 psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
632
633 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
634 {
635 // UL pixel corner
636 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
637 // pixel center
638 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
639
640 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
641 {
642 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
643 // pixel center
644 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
645
646 RDTSC_START(BEBarycentric);
647 CalcPixelBarycentrics(coeffs, psContext);
648 RDTSC_STOP(BEBarycentric, 0, 0);
649
650 if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
651 {
652 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
653 &work.coverageMask[0];
654 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
655 }
656
657 if(T::bCentroidPos)
658 {
659 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
660 RDTSC_START(BEBarycentric);
661 if(T::bIsStandardPattern)
662 {
663 CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
664 }
665 else
666 {
667 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
668 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
669 }
670 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
671 RDTSC_STOP(BEBarycentric, 0, 0);
672 }
673 else
674 {
675 psContext.vX.centroid = psContext.vX.sample;
676 psContext.vY.centroid = psContext.vY.sample;
677 }
678
679 for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
680 {
681 simdmask coverageMask = work.coverageMask[sample] & MASK;
682 if (coverageMask)
683 {
684 RDTSC_START(BEBarycentric);
685 // calculate per sample positions
686 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
687 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
688
689 CalcSampleBarycentrics(coeffs, psContext);
690
691 // interpolate and quantize z
692 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
693 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
694 RDTSC_STOP(BEBarycentric, 0, 0);
695
696 // interpolate user clip distance if available
697 if (rastState.clipDistanceMask)
698 {
699 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
700 psContext.vI.sample, psContext.vJ.sample);
701 }
702
703 simdscalar vCoverageMask = vMask(coverageMask);
704 simdscalar depthPassMask = vCoverageMask;
705 simdscalar stencilPassMask = vCoverageMask;
706
707 // offset depth/stencil buffers current sample
708 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
709 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
710
711 // Early-Z?
712 if (T::bCanEarlyZ)
713 {
714 RDTSC_START(BEEarlyDepthTest);
715 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
716 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
717 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
718
719 // early-exit if no samples passed depth or earlyZ is forced on.
720 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
721 {
722 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
723 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
724
725 if (!_simd_movemask_ps(depthPassMask))
726 {
727 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
728 continue;
729 }
730 }
731 }
732
733 psContext.sampleIndex = sample;
734 psContext.activeMask = _simd_castps_si(vCoverageMask);
735
736 // execute pixel shader
737 RDTSC_START(BEPixelShader);
738 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
739 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
740 RDTSC_STOP(BEPixelShader, 0, 0);
741
742 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
743
744 // late-Z
745 if (!T::bCanEarlyZ)
746 {
747 RDTSC_START(BELateDepthTest);
748 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
749 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
750 RDTSC_STOP(BELateDepthTest, 0, 0);
751
752 if (!_simd_movemask_ps(depthPassMask))
753 {
754 // need to call depth/stencil write for stencil write
755 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
756 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
757
758 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
759 continue;
760 }
761 }
762
763 uint32_t statMask = _simd_movemask_ps(depthPassMask);
764 uint32_t statCount = _mm_popcnt_u32(statMask);
765 UPDATE_STAT(DepthPassCount, statCount);
766
767 // output merger
768 RDTSC_START(BEOutputMerger);
769 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
770
771 // do final depth write after all pixel kills
772 if (!pPSState->forceEarlyZ)
773 {
774 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
775 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
776 }
777 RDTSC_STOP(BEOutputMerger, 0, 0);
778 }
779 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
780 }
781 RDTSC_START(BEEndTile);
782 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
783 {
784 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
785 }
786 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
787 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
788
789 for (uint32_t rt = 0; rt < NumRT; ++rt)
790 {
791 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
792 }
793 RDTSC_STOP(BEEndTile, 0, 0);
794 }
795 }
796 RDTSC_STOP(BESampleRateBackend, 0, 0);
797 }
798
// Pixel-rate backend. Walks the tile whose first pixel is (x, y) in SIMD-tile sized steps.
// For every SIMD tile that still has active lanes the pixel shader is invoked once
// (psContext.sampleIndex stays 0) and the output merger is then run once per output-merger
// sample with those same shader results.
//
// T supplies the compile-time traits read in this body: InputCoverage, bCentroidPos,
// bIsStandardPattern, bForcedSampleCount, bCanEarlyZ and MultisampleT.
//
// pDC           - draw context; API state and private state are fetched from it
// workerId      - not referenced directly in this body; presumably consumed by the
//                 stat/profiling macros - TODO confirm
// x, y          - coordinates of the first pixel of the tile
// work          - triangle descriptor; its coverage masks are shifted in place as SIMD tiles
//                 are consumed
// renderBuffers - base pointers of the color, depth and stencil buffers for this tile
799 template<typename T>
800 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
801 {
802 RDTSC_START(BEPixelRateBackend);
803 RDTSC_START(BESetup);
804
805 const API_STATE& state = GetApiState(pDC);
806 const SWR_RASTSTATE& rastState = state.rastState;
807 const SWR_PS_STATE *pPSState = &state.psState;
808 const SWR_BLEND_STATE *pBlendState = &state.blendState;
809
810 // broadcast scalars
811 BarycentricCoeffs coeffs;
812 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
813 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
814 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
815
816 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
817 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
818 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
819
820 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
821 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
822 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
823
824 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
825
826 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
827 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
828 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
829
// local copies of the buffer base pointers; they are advanced once per SIMD tile at Endtile
830 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
831 uint32_t NumRT = state.psState.numRenderTargets;
832 for(uint32_t rt = 0; rt < NumRT; ++rt)
833 {
834 pColorBase[rt] = renderBuffers.pColor[rt];
835 }
836 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
837 RDTSC_STOP(BESetup, 0, 0);
838
// pixel shader context fields that are set once for the whole tile
839 SWR_PS_CONTEXT psContext;
840 psContext.pAttribs = work.pAttribs;
841 psContext.pPerspAttribs = work.pPerspAttribs;
842 psContext.frontFace = work.triFlags.frontFacing;
843 psContext.primID = work.triFlags.primID;
844 psContext.pRecipW = work.pRecipW;
845 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
846 psContext.I = work.I;
847 psContext.J = work.J;
848 psContext.recipDet = work.recipDet;
849 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
850 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
851 psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
852
853 psContext.sampleIndex = 0;
854
// invoked below for the early or late Z test; its per-sample members (vCoverageMask,
// depthPassMask, stencilPassMask, vZ) are read back in the output merger loop
855 PixelRateZTestLoop<T> PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
856
857 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
858 {
859 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
860 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
861 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
862 {
863 simdscalar activeLanes;
// nothing covered in this SIMD tile: jump straight to the per-tile bookkeeping at Endtile
864 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
865 activeLanes = vMask(work.anyCoveredSamples & MASK);
866
867 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
868 // set pixel center positions
869 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
870
871 RDTSC_START(BEBarycentric);
872 CalcPixelBarycentrics(coeffs, psContext);
873 RDTSC_STOP(BEBarycentric, 0, 0);
874
// input coverage for the shader comes from the inner-conservative mask when that mode is
// selected, otherwise from sample 0's coverage mask
875 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
876 {
877 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
878 &work.coverageMask[0];
879 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
880 }
881
882 if(T::bCentroidPos)
883 {
884 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
885 RDTSC_START(BEBarycentric);
886 if(T::bIsStandardPattern)
887 {
888 CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
889 }
890 else
891 {
// non-standard pattern: centroid is placed at the pixel center (UL + 0.5)
892 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
893 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
894 }
895
896 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
897 RDTSC_STOP(BEBarycentric, 0, 0);
898 }
899 else
900 {
// centroid not requested: still fill the centroid position with the pixel center (UL + 0.5)
901 psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
902 psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
903 }
904
905 if(T::bForcedSampleCount)
906 {
907 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
// NOTE(review): _simd_cmpgt_epi32 is presumably a signed compare, so a sampleMask with bit 31
// set would not be treated as non-zero here - confirm the mask never has that bit set
908 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
909 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
910 }
911
912 // Early-Z?
// skipped entirely when the sample count is forced; activeLanes is re-checked right after,
// so it is presumably updated by the call - TODO confirm against PixelRateZTestLoop
913 if(T::bCanEarlyZ && !T::bForcedSampleCount)
914 {
915 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
916 UPDATE_STAT(DepthPassCount, depthPassCount);
917 }
918
919 // if we have no covered samples that passed depth at this point, go to next tile
920 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
921
922 if(pPSState->usesSourceDepth)
923 {
924 RDTSC_START(BEBarycentric);
925 // interpolate and quantize z
926 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
927 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
928 RDTSC_STOP(BEBarycentric, 0, 0);
929 }
930
931 // pixels that are currently active
932 psContext.activeMask = _simd_castps_si(activeLanes);
933 psContext.oMask = T::MultisampleT::FullSampleMask();
934
935 // execute pixel shader
936 RDTSC_START(BEPixelShader);
937 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
938 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
939 RDTSC_STOP(BEPixelShader, 0, 0);
940
941 // update active lanes to remove any discarded or oMask'd pixels
// NOTE(review): same presumably-signed compare as above, applied to the shader-written oMask - confirm
942 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
943 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
944
945 // late-Z
946 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
947 {
948 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
949 UPDATE_STAT(DepthPassCount, depthPassCount);
950 }
951
952 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
953 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
954
955 // output merger
956 // loop over all samples, broadcasting the results of the PS to all passing pixels
957 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
958 {
959 RDTSC_START(BEOutputMerger);
960 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
961 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
962 simdscalar coverageMask, depthMask;
963 if(T::bForcedSampleCount)
964 {
// no Z test was run in forced-sample-count mode, so the active lanes serve as both masks
965 coverageMask = depthMask = activeLanes;
966 }
967 else
968 {
969 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
970 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
971 if(!_simd_movemask_ps(depthMask))
972 {
973 // stencil should already have been written in early/lateZ tests
974 RDTSC_STOP(BEOutputMerger, 0, 0);
975 continue;
976 }
977 }
978
979 // broadcast the results of the PS to all passing pixels
980 OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
981
// depth/stencil write for this sample; skipped when early Z is forced or the sample count is forced
982 if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
983 {
984 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
985 uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
986
987 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
988 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
989 }
990 RDTSC_STOP(BEOutputMerger, 0, 0);
991 }
// per-SIMD-tile bookkeeping, reached by fall-through and by every goto above: shift the
// consumed bits out of each coverage mask and step the buffer pointers to the next SIMD tile
992 Endtile:
993 RDTSC_START(BEEndTile);
994 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
995 {
996 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
997 }
998
999 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1000 {
1001 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1002 }
1003 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1004 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1005 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1006
1007 for(uint32_t rt = 0; rt < NumRT; ++rt)
1008 {
1009 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1010 }
1011 RDTSC_STOP(BEEndTile, 0, 0);
1012 }
1013 }
1014 RDTSC_STOP(BEPixelRateBackend, 0, 0);
1015 }
1016 // optimized backend flow with NULL PS
// No pixel shader is invoked and no color is written in this body: for every sample enabled in
// the blend state's sample mask it interpolates Z, applies user clip distances, and runs the
// depth/stencil test followed directly by the depth/stencil write.
//
// sampleCountT  - multisample count; the sample pattern is fixed to the standard pattern below
// pDC           - draw context the API state is read from
// workerId      - not referenced directly in this body; presumably consumed by the
//                 stat/profiling macros - TODO confirm
// x, y          - coordinates of the first pixel of the tile
// work          - triangle descriptor; per-sample coverage masks are shifted in place
// renderBuffers - only the depth and stencil base pointers are used here
1017 template<uint32_t sampleCountT>
1018 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1019 {
1020 RDTSC_START(BENullBackend);
1021 ///@todo: handle center multisample pattern
1022 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1023 RDTSC_START(BESetup);
1024
1025 const API_STATE& state = GetApiState(pDC);
1026 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1027
1028 // broadcast scalars
1029 BarycentricCoeffs coeffs;
1030 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1031 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1032 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1033
1034 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1035 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1036 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1037
1038 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1039 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1040 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1041
1042 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1043
1044 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1045
1046 RDTSC_STOP(BESetup, 0, 0);
1047
// only the sample position, sample barycentric and vZ members of psContext are written and
// read in this function
1048 SWR_PS_CONTEXT psContext;
1049 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1050 {
1051 // UL pixel corner
1052 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1053
1054 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1055 {
1056 // UL pixel corners
1057 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1058
1059 // iterate over active samples
// NOTE(review): 'sample' is the bit position found in sampleMask and indexes
// work.coverageMask directly - presumably the mask only has bits for valid samples; confirm
1060 unsigned long sample = 0;
1061 uint32_t sampleMask = state.blendState.sampleMask;
1062 while (_BitScanForward(&sample, sampleMask))
1063 {
// clear the bit that was just found so the scan advances to the next enabled sample
1064 sampleMask &= ~(1 << sample);
1065 simdmask coverageMask = work.coverageMask[sample] & MASK;
1066 if (coverageMask)
1067 {
1068 RDTSC_START(BEBarycentric);
1069 // calculate per sample positions
1070 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1071 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1072
1073 CalcSampleBarycentrics(coeffs, psContext);
1074
1075 // interpolate and quantize z
1076 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1077 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1078
1079 RDTSC_STOP(BEBarycentric, 0, 0);
1080
1081 // interpolate user clip distance if available
1082 if (rastState.clipDistanceMask)
1083 {
// lanes reported by ComputeUserClipMask are removed from the coverage mask
1084 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1085 psContext.vI.sample, psContext.vJ.sample);
1086 }
1087
1088 simdscalar vCoverageMask = vMask(coverageMask);
1089 simdscalar stencilPassMask = vCoverageMask;
1090
1091 // offset depth/stencil buffers current sample
1092 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
1093 uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
1094
// test and write back-to-back: there is no shader between them in this backend
1095 RDTSC_START(BEEarlyDepthTest);
1096 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
1097 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1098 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1099 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1100 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1101
// count the lanes that passed the depth test for the DepthPassCount statistic
1102 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1103 uint32_t statCount = _mm_popcnt_u32(statMask);
1104 UPDATE_STAT(DepthPassCount, statCount);
1105 }
// consume this SIMD tile's bits for the sample, whether or not anything was covered
1106 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1107 }
// step the depth/stencil pointers to the next SIMD tile
1108 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1109 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1110 }
1111 }
1112 RDTSC_STOP(BENullBackend, 0, 0);
1113 }
1114
1115 void InitClearTilesTable()
1116 {
1117 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1118
1119 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1120 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1121 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1122 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1123 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1124 }
1125
1126 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
1127 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
1128 [2] // centroid
1129 [2] // canEarlyZ
1130 = {};
1131 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1132 [SWR_MSAA_SAMPLE_PATTERN_COUNT]
1133 [SWR_INPUT_COVERAGE_COUNT]
1134 [2] // centroid
1135 [2] // forcedSampleCount
1136 [2] // canEarlyZ
1137 = {};
1138 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
1139 [SWR_INPUT_COVERAGE_COUNT]
1140 [2] // centroid
1141 [2] // canEarlyZ
1142 = {};
1143
1144 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1145 // arguments to static template arguments.
1146 template <uint32_t... ArgsT>
1147 struct BEChooser
1148 {
1149 // Last Arg Terminator
1150 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1151 {
1152 switch(tArg)
1153 {
1154 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1155 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1156 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1157 default:
1158 SWR_ASSERT(0 && "Invalid backend func\n");
1159 return nullptr;
1160 break;
1161 }
1162 }
1163
1164 // Recursively parse args
1165 template <typename... TArgsT>
1166 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1167 {
1168 switch(tArg)
1169 {
1170 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1171 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1172 default:
1173 SWR_ASSERT(0 && "Invalid sample pattern\n");
1174 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1175 break;
1176 }
1177 }
1178
1179 // Recursively parse args
1180 template <typename... TArgsT>
1181 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
1182 {
1183 switch(tArg)
1184 {
1185 case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
1186 case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
1187 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
1188 default:
1189 SWR_ASSERT(0 && "Invalid sample pattern\n");
1190 return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
1191 break;
1192 }
1193 }
1194
1195 // Recursively parse args
1196 template <typename... TArgsT>
1197 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1198 {
1199 switch(tArg)
1200 {
1201 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1202 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1203 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1204 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1205 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1206 default:
1207 SWR_ASSERT(0 && "Invalid sample count\n");
1208 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1209 break;
1210 }
1211 }
1212
1213 // Recursively parse args
1214 template <typename... TArgsT>
1215 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1216 {
1217 if(tArg == true)
1218 {
1219 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1220 }
1221
1222 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1223 }
1224 };
1225
1226 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
1227 {
1228 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1229 {
1230 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1231 {
1232 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1233 {
1234 table[inputCoverage][isCentroid][canEarlyZ] =
1235 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1236 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
1237 }
1238 }
1239 }
1240 }
1241
1242 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
1243 {
1244 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1245 {
1246 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
1247 {
1248 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1249 {
1250 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1251 {
1252 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
1253 {
1254 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1255 {
1256 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
1257 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
1258 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
1259 }
1260 }
1261 }
1262 }
1263 }
1264 }
1265 }
1266
1267 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
1268 {
1269 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
1270 {
1271 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
1272 {
1273 for(uint32_t centroid = 0; centroid < 2; centroid++)
1274 {
1275 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
1276 {
1277 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
1278 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
1279 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1280 }
1281 }
1282 }
1283 }
1284 }
1285
1286 void InitBackendFuncTables()
1287 {
1288 InitBackendSingleFuncTable(gBackendSingleSample);
1289 InitBackendPixelFuncTable(gBackendPixelRateTable);
1290 InitBackendSampleFuncTable(gBackendSampleRateTable);
1291
1292 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1293 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1294 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1295 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1296 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1297 }