swr: [rasterizer core] backend refactor
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "rdtsc_core.h"
33 #include "backend.h"
34 #include "depthstencil.h"
35 #include "tilemgr.h"
36 #include "memory/tilingtraits.h"
37 #include "core/multisample.h"
38
39 #include <algorithm>
40
41 const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5};
42 const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5};
43
/// @todo move to common lib
// Expands four mask bits, written most-significant first, into a 4-lane
// initializer: lane N receives the negated value of bit N (0 or -1).
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}

// Lookup table indexed by a 4-bit mask value; entry M is MASKTOVEC applied to
// the binary digits of M.
static const __m128 gMaskToVec[] = {
    // 0b0000 .. 0b0011
    MASKTOVEC(0,0,0,0), MASKTOVEC(0,0,0,1), MASKTOVEC(0,0,1,0), MASKTOVEC(0,0,1,1),
    // 0b0100 .. 0b0111
    MASKTOVEC(0,1,0,0), MASKTOVEC(0,1,0,1), MASKTOVEC(0,1,1,0), MASKTOVEC(0,1,1,1),
    // 0b1000 .. 0b1011
    MASKTOVEC(1,0,0,0), MASKTOVEC(1,0,0,1), MASKTOVEC(1,0,1,0), MASKTOVEC(1,0,1,1),
    // 0b1100 .. 0b1111
    MASKTOVEC(1,1,0,0), MASKTOVEC(1,1,0,1), MASKTOVEC(1,1,1,0), MASKTOVEC(1,1,1,1),
};
64
65 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
66 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
67
68 //////////////////////////////////////////////////////////////////////////
69 /// @brief Process compute work.
70 /// @param pDC - pointer to draw context (dispatch).
71 /// @param workerId - The unique worker ID that is assigned to this thread.
72 /// @param threadGroupId - the linear index for the thread group within the dispatch.
73 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
74 {
75 RDTSC_START(BEDispatch);
76
77 SWR_CONTEXT *pContext = pDC->pContext;
78
79 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
80 SWR_ASSERT(pTaskData != nullptr);
81
82 // Ensure spill fill memory has been allocated.
83 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
84 if (spillFillSize && pSpillFillBuffer == nullptr)
85 {
86 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
87 }
88
89 const API_STATE& state = GetApiState(pDC);
90
91 SWR_CS_CONTEXT csContext{ 0 };
92 csContext.tileCounter = threadGroupId;
93 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
94 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
95 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
96 csContext.pTGSM = pContext->pScratch[workerId];
97 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
98
99 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
100
101 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
102
103 RDTSC_STOP(BEDispatch, 1, 0);
104 }
105
106 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
107 {
108 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
109
110 uint32_t x, y;
111 MacroTileMgr::getTileIndices(macroTile, x, y);
112 SWR_ASSERT(x == 0 && y == 0);
113
114 if (pSync->pfnCallbackFunc != nullptr)
115 {
116 pSync->pfnCallbackFunc(pSync->userData, pSync->userData2, pSync->userData3);
117 }
118 }
119
120 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
121 {
122 QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData;
123 SWR_STATS* pStats = pQueryDesc->pStats;
124 SWR_CONTEXT *pContext = pDC->pContext;
125
126 SWR_ASSERT(pStats != nullptr);
127
128 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
129 {
130 pStats->DepthPassCount += pContext->stats[i].DepthPassCount;
131
132 pStats->IaVertices += pContext->stats[i].IaVertices;
133 pStats->IaPrimitives += pContext->stats[i].IaPrimitives;
134 pStats->VsInvocations += pContext->stats[i].VsInvocations;
135 pStats->HsInvocations += pContext->stats[i].HsInvocations;
136 pStats->DsInvocations += pContext->stats[i].DsInvocations;
137 pStats->GsInvocations += pContext->stats[i].GsInvocations;
138 pStats->PsInvocations += pContext->stats[i].PsInvocations;
139 pStats->CInvocations += pContext->stats[i].CInvocations;
140 pStats->CsInvocations += pContext->stats[i].CsInvocations;
141 pStats->CPrimitives += pContext->stats[i].CPrimitives;
142 pStats->GsPrimitives += pContext->stats[i].GsPrimitives;
143
144 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
145 {
146 pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream];
147
148 /// @note client is required to provide valid write offset before every draw, so we clear
149 /// out the contents of the write offset when storing stats
150 pContext->stats[i].SoWriteOffset[stream] = 0;
151
152 pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream];
153 pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream];
154 }
155 }
156 }
157
158 template<SWR_FORMAT format>
159 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
160 {
161 auto lambda = [&](int comp)
162 {
163 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
164 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
165 };
166
167 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
168 for (uint32_t i = 0; i < numIter; ++i)
169 {
170 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
171 }
172 }
173
174 template<SWR_FORMAT format>
175 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4])
176 {
177 // convert clear color to hottile format
178 // clear color is in RGBA float/uint32
179 simdvector vClear;
180 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
181 {
182 simdscalar vComp;
183 vComp = _simd_load1_ps((const float*)&clear[comp]);
184 if (FormatTraits<format>::isNormalized(comp))
185 {
186 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
187 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
188 }
189 vComp = FormatTraits<format>::pack(comp, vComp);
190 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
191 }
192
193 uint32_t tileX, tileY;
194 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
195 const API_STATE& state = GetApiState(pDC);
196
197 int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY;
198 int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1;
199 int left = KNOB_MACROTILE_X_DIM_FIXED * tileX;
200 int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1;
201
202 // intersect with scissor
203 top = std::max(top, state.scissorInFixedPoint.top);
204 left = std::max(left, state.scissorInFixedPoint.left);
205 bottom = std::min(bottom, state.scissorInFixedPoint.bottom);
206 right = std::min(right, state.scissorInFixedPoint.right);
207
208 // translate to local hottile origin
209 top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
210 bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
211 left -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
212 right -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
213
214 // convert to raster tiles
215 top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
216 bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
217 left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
218 right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
219
220 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
221 // compute steps between raster tile samples / raster tiles / macro tile rows
222 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
223 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
224 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
225 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
226
227 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
228 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples;
229 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
230
231 // loop over all raster tiles in the current hot tile
232 for (int y = top; y <= bottom; ++y)
233 {
234 uint8_t* pRasterTile = pRasterTileRow;
235 for (int x = left; x <= right; ++x)
236 {
237 for( int sampleNum = 0; sampleNum < numSamples; sampleNum++)
238 {
239 ClearRasterTile<format>(pRasterTile, vClear);
240 pRasterTile += rasterTileSampleStep;
241 }
242 }
243 pRasterTileRow += macroTileRowStep;
244 }
245
246 pHotTile->state = HOTTILE_DIRTY;
247 }
248
249
250 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
251 {
252 if (KNOB_FAST_CLEAR)
253 {
254 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
255 SWR_CONTEXT *pContext = pDC->pContext;
256 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
257 uint32_t numSamples = GetNumSamples(sampleCount);
258
259 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
260
261 RDTSC_START(BEClear);
262
263 if (pClear->flags.mask & SWR_CLEAR_COLOR)
264 {
265 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
266 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
267 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
268 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
269 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
270 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
271 pHotTile->state = HOTTILE_CLEAR;
272 }
273
274 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
275 {
276 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
277 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
278 pHotTile->state = HOTTILE_CLEAR;
279 }
280
281 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
282 {
283 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
284
285 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
286 pHotTile->state = HOTTILE_CLEAR;
287 }
288
289 RDTSC_STOP(BEClear, 0, 0);
290 }
291 else
292 {
293 // Legacy clear
294 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
295 RDTSC_START(BEClear);
296
297 if (pClear->flags.mask & SWR_CLEAR_COLOR)
298 {
299 /// @todo clear data should come in as RGBA32_FLOAT
300 DWORD clearData[4];
301 float clearFloat[4];
302 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
303 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
304 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
305 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
306 clearData[0] = *(DWORD*)&clearFloat[0];
307 clearData[1] = *(DWORD*)&clearFloat[1];
308 clearData[2] = *(DWORD*)&clearFloat[2];
309 clearData[3] = *(DWORD*)&clearFloat[3];
310
311 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
312 SWR_ASSERT(pfnClearTiles != nullptr);
313
314 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData);
315 }
316
317 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
318 {
319 DWORD clearData[4];
320 clearData[0] = *(DWORD*)&pClear->clearDepth;
321 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
322 SWR_ASSERT(pfnClearTiles != nullptr);
323
324 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData);
325 }
326
327 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
328 {
329 uint32_t value = pClear->clearStencil;
330 DWORD clearData[4];
331 clearData[0] = *(DWORD*)&value;
332 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
333
334 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData);
335 }
336
337 RDTSC_STOP(BEClear, 0, 0);
338 }
339 }
340
341
342 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
343 {
344 RDTSC_START(BEStoreTiles);
345 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
346 SWR_CONTEXT *pContext = pDC->pContext;
347
348 #ifdef KNOB_ENABLE_RDTSC
349 uint32_t numTiles = 0;
350 #endif
351 SWR_FORMAT srcFormat;
352 switch (pDesc->attachment)
353 {
354 case SWR_ATTACHMENT_COLOR0:
355 case SWR_ATTACHMENT_COLOR1:
356 case SWR_ATTACHMENT_COLOR2:
357 case SWR_ATTACHMENT_COLOR3:
358 case SWR_ATTACHMENT_COLOR4:
359 case SWR_ATTACHMENT_COLOR5:
360 case SWR_ATTACHMENT_COLOR6:
361 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
362 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
363 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
364 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
365 }
366
367 uint32_t x, y;
368 MacroTileMgr::getTileIndices(macroTile, x, y);
369
370 // Only need to store the hottile if it's been rendered to...
371 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
372 if (pHotTile)
373 {
374 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
375 if (pHotTile->state == HOTTILE_CLEAR)
376 {
377 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
378 SWR_ASSERT(pfnClearTiles != nullptr);
379
380 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData);
381 }
382
383 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
384 {
385 int destX = KNOB_MACROTILE_X_DIM * x;
386 int destY = KNOB_MACROTILE_Y_DIM * y;
387
388 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
389 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
390 }
391
392
393 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
394 {
395 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
396 }
397 }
398 RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
399 }
400
401
402 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
403 {
404 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
405 SWR_CONTEXT *pContext = pDC->pContext;
406
407 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
408
409 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
410 {
411 if (pDesc->attachmentMask & (1 << i))
412 {
413 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
414 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
415 if (pHotTile)
416 {
417 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
418 }
419 }
420 }
421 }
422
423 #if KNOB_SIMD_WIDTH == 8
424 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
425 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
426 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
427 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
428 #else
429 #error Unsupported vector width
430 #endif
431
432 INLINE
433 bool CanEarlyZ(const SWR_PS_STATE *pPSState)
434 {
435 return (pPSState->forceEarlyZ || (!pPSState->writesODepth && !pPSState->usesSourceDepth && !pPSState->usesUAV));
436 }
437
438 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
439 {
440 simdscalar vClipMask = _simd_setzero_ps();
441 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
442
443 for (uint32_t i = 0; i < numClipDistance; ++i)
444 {
445 // pull triangle clip distance values from clip buffer
446 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
447 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
448 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
449
450 // interpolate
451 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
452
453 // clip if interpolated clip distance is < 0 || NAN
454 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
455
456 vClipMask = _simd_or_ps(vClipMask, vCull);
457 }
458
459 return _simd_movemask_ps(vClipMask);
460 }
461
462 template<bool bGenerateBarycentrics>
463 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
464 {
465 if(bGenerateBarycentrics)
466 {
467 // evaluate I,J
468 psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
469 psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
470 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
471 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
472
473 // interpolate 1/w
474 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
475 }
476 }
477
478 template<bool bGenerateBarycentrics>
479 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
480 {
481 if(bGenerateBarycentrics)
482 {
483 // evaluate I,J
484 psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
485 psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
486 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
487 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
488
489 // interpolate 1/w
490 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
491 }
492 }
493
494
495 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
496 // Centroid behaves exactly as follows :
497 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
498 // have a sample location there).
499 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
500 // coverage with the SampleMask Rasterizer State.
501 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
502 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
503 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
504 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
505 template<typename T>
506 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
507 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
508 {
509 uint32_t inputMask[KNOB_SIMD_WIDTH];
510 generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
511
512 // Case (2) - partially covered pixel
513
514 // scan for first covered sample per pixel in the 4x2 span
515 unsigned long sampleNum[KNOB_SIMD_WIDTH];
516 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
517 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
518 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
519 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
520 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
521 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
522 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
523 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
524
525 // look up and set the sample offsets from UL pixel corner for first covered sample
526 __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
527 T::MultisampleT::X(sampleNum[6]),
528 T::MultisampleT::X(sampleNum[5]),
529 T::MultisampleT::X(sampleNum[4]),
530 T::MultisampleT::X(sampleNum[3]),
531 T::MultisampleT::X(sampleNum[2]),
532 T::MultisampleT::X(sampleNum[1]),
533 T::MultisampleT::X(sampleNum[0]));
534
535 __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
536 T::MultisampleT::Y(sampleNum[6]),
537 T::MultisampleT::Y(sampleNum[5]),
538 T::MultisampleT::Y(sampleNum[4]),
539 T::MultisampleT::Y(sampleNum[3]),
540 T::MultisampleT::Y(sampleNum[2]),
541 T::MultisampleT::Y(sampleNum[1]),
542 T::MultisampleT::Y(sampleNum[0]));
543 // add sample offset to UL pixel corner
544 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
545 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
546
547 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
548 static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
549 __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
550 __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
551
552 static const __m256i vZero = _simd_setzero_si();
553 const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
554 __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
555 __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
556 __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
557
558 __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
559
560 // set the centroid position based on results from above
561 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
562 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
563
564 // Case (3a) No samples covered and partial sample mask
565 __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
566 // sample mask should never be all 0's for this case, but handle it anyways
567 unsigned long firstCoveredSampleMaskSample = 0;
568 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
569
570 __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
571
572 vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
573 vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
574
575 // blend in case 3a pixel locations
576 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
577 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
578 }
579
580 template<typename T>
581 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
582 const uint64_t *const coverageMask, const uint32_t sampleMask,
583 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
584 {
585 if(T::bIsStandardPattern)
586 {
587 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
588 CalcCentroidPos<T>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
589 }
590 else
591 {
592 static const __m256 pixelCenter = _simd_set1_ps(0.5f);
593 psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
594 psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
595 }
596 // evaluate I,J
597 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
598 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
599 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
600 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
601
602 // interpolate 1/w
603 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
604 }
605
606 template<uint32_t NumRT, uint32_t sampleCountT>
607 void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
608 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask)
609 {
610 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
611 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
612 uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
613 simdvector blendOut;
614
615 for(uint32_t rt = 0; rt < NumRT; ++rt)
616 {
617 uint8_t *pColorSample;
618 if(sampleCount == SWR_MULTISAMPLE_1X)
619 {
620 pColorSample = pColorBase[rt];
621 }
622 else
623 {
624 pColorSample = pColorBase[rt] + rasterTileColorOffset;
625 }
626
627 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
628 // pfnBlendFunc may not update all channels. Initialize with PS output.
629 /// TODO: move this into the blend JIT.
630 blendOut = psContext.shaded[rt];
631
632 // Blend outputs and update coverage mask for alpha test
633 if(pfnBlendFunc[rt] != nullptr)
634 {
635 pfnBlendFunc[rt](
636 pBlendState,
637 psContext.shaded[rt],
638 psContext.shaded[1],
639 sample,
640 pColorSample,
641 blendOut,
642 &psContext.oMask,
643 (simdscalari*)&coverageMask);
644 }
645
646 // final write mask
647 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
648
649 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
650 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
651
652 const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
653
654 // store with color mask
655 if(!pRTBlend->writeDisableRed)
656 {
657 _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
658 }
659 if(!pRTBlend->writeDisableGreen)
660 {
661 _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
662 }
663 if(!pRTBlend->writeDisableBlue)
664 {
665 _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
666 }
667 if(!pRTBlend->writeDisableAlpha)
668 {
669 _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
670 }
671 }
672 }
673
674 template<typename T>
675 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
676 {
677 RDTSC_START(BESetup);
678
679 SWR_CONTEXT *pContext = pDC->pContext;
680 const API_STATE& state = GetApiState(pDC);
681 const SWR_RASTSTATE& rastState = state.rastState;
682 const SWR_PS_STATE *pPSState = &state.psState;
683 const SWR_BLEND_STATE *pBlendState = &state.blendState;
684 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
685 uint64_t coverageMask = work.coverageMask[0];
686
687 // broadcast scalars
688 BarycentricCoeffs coeffs;
689 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
690 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
691 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
692
693 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
694 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
695 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
696
697 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
698 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
699 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
700
701 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
702
703 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
704 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
705 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
706
707 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
708 uint32_t NumRT = state.psState.numRenderTargets;
709 for(uint32_t rt = 0; rt < NumRT; ++rt)
710 {
711 pColorBase[rt] = renderBuffers.pColor[rt];
712 }
713 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
714 RDTSC_STOP(BESetup, 0, 0);
715
716 SWR_PS_CONTEXT psContext;
717 psContext.pAttribs = work.pAttribs;
718 psContext.pPerspAttribs = work.pPerspAttribs;
719 psContext.frontFace = work.triFlags.frontFacing;
720 psContext.primID = work.triFlags.primID;
721
722 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
723 psContext.I = work.I;
724 psContext.J = work.J;
725 psContext.recipDet = work.recipDet;
726 psContext.pRecipW = work.pRecipW;
727 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
728 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
729
730 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
731 {
732 // UL pixel corner
733 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
734 // pixel center
735 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
736
737 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
738 {
739 if(T::bInputCoverage)
740 {
741 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
742 }
743
744 if(coverageMask & MASK)
745 {
746 RDTSC_START(BEBarycentric);
747 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
748 // pixel center
749 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
750
751 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
752
753 if(T::bCentroidPos)
754 {
755 // for 1x case, centroid is pixel center
756 psContext.vX.centroid = psContext.vX.center;
757 psContext.vY.centroid = psContext.vY.center;
758 psContext.vI.centroid = psContext.vI.center;
759 psContext.vJ.centroid = psContext.vJ.center;
760 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
761 }
762
763 // interpolate and quantize z
764 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
765 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
766
767 RDTSC_STOP(BEBarycentric, 0, 0);
768
769 simdmask clipCoverageMask = coverageMask & MASK;
770
771 // interpolate user clip distance if available
772 if(rastState.clipDistanceMask)
773 {
774 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
775 psContext.vI.center, psContext.vJ.center);
776 }
777
778 simdscalar vCoverageMask = vMask(clipCoverageMask);
779 simdscalar depthPassMask = vCoverageMask;
780 simdscalar stencilPassMask = vCoverageMask;
781
782 // Early-Z?
783 if(CanEarlyZ(pPSState))
784 {
785 RDTSC_START(BEEarlyDepthTest);
786 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
787 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
788 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
789
790 // early-exit if no pixels passed depth or earlyZ is forced on
791 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
792 {
793 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
794 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
795
796 if (!_simd_movemask_ps(depthPassMask))
797 {
798 goto Endtile;
799 }
800 }
801 }
802
803 psContext.sampleIndex = 0;
804 psContext.activeMask = _simd_castps_si(vCoverageMask);
805
806 // execute pixel shader
807 RDTSC_START(BEPixelShader);
808 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
809 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
810 RDTSC_STOP(BEPixelShader, 0, 0);
811
812 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
813
814 // late-Z
815 if(!CanEarlyZ(pPSState))
816 {
817 RDTSC_START(BELateDepthTest);
818 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
819 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
820 RDTSC_STOP(BELateDepthTest, 0, 0);
821
822 if(!_simd_movemask_ps(depthPassMask))
823 {
824 // need to call depth/stencil write for stencil write
825 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
826 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
827 goto Endtile;
828 }
829 }
830
831 uint32_t statMask = _simd_movemask_ps(depthPassMask);
832 uint32_t statCount = _mm_popcnt_u32(statMask);
833 UPDATE_STAT(DepthPassCount, statCount);
834
835 // output merger
836 RDTSC_START(BEOutputMerger);
837 backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc,
838 vCoverageMask, depthPassMask);
839
840 // do final depth write after all pixel kills
841 if (!pPSState->forceEarlyZ)
842 {
843 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
844 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
845 }
846 RDTSC_STOP(BEOutputMerger, 0, 0);
847 }
848
849 Endtile:
850 RDTSC_START(BEEndTile);
851 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
852 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
853 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
854
855 for(uint32_t rt = 0; rt < NumRT; ++rt)
856 {
857 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
858 }
859 RDTSC_STOP(BEEndTile, 0, 0);
860 }
861 }
862 }
863
864 template<typename T>
865 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
866 {
867 RDTSC_START(BESetup);
868
869 SWR_CONTEXT *pContext = pDC->pContext;
870 const API_STATE& state = GetApiState(pDC);
871 const SWR_RASTSTATE& rastState = state.rastState;
872 const SWR_PS_STATE *pPSState = &state.psState;
873 const SWR_BLEND_STATE *pBlendState = &state.blendState;
874 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
875
876 // broadcast scalars
877 BarycentricCoeffs coeffs;
878 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
879 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
880 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
881
882 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
883 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
884 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
885
886 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
887 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
888 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
889
890 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
891
892 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
893 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
894 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
895
896 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
897 uint32_t NumRT = state.psState.numRenderTargets;
898 for(uint32_t rt = 0; rt < NumRT; ++rt)
899 {
900 pColorBase[rt] = renderBuffers.pColor[rt];
901 }
902 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
903 RDTSC_STOP(BESetup, 0, 0);
904
905 SWR_PS_CONTEXT psContext;
906 psContext.pAttribs = work.pAttribs;
907 psContext.pPerspAttribs = work.pPerspAttribs;
908 psContext.pRecipW = work.pRecipW;
909 psContext.frontFace = work.triFlags.frontFacing;
910 psContext.primID = work.triFlags.primID;
911
912 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
913 psContext.I = work.I;
914 psContext.J = work.J;
915 psContext.recipDet = work.recipDet;
916 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
917 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
918 const uint32_t numSamples = T::MultisampleT::numSamples;
919
920 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
921 {
922 // UL pixel corner
923 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
924 // pixel center
925 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
926
927 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
928 {
929 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
930 // pixel center
931 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
932
933 RDTSC_START(BEBarycentric);
934 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
935 RDTSC_STOP(BEBarycentric, 0, 0);
936
937 if(T::bInputCoverage)
938 {
939 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
940 }
941
942 if(T::bCentroidPos)
943 {
944 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
945 RDTSC_START(BEBarycentric);
946 CalcCentroidBarycentrics<T>(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
947 RDTSC_STOP(BEBarycentric, 0, 0);
948 }
949
950 for(uint32_t sample = 0; sample < numSamples; sample++)
951 {
952 if (work.coverageMask[sample] & MASK)
953 {
954 RDTSC_START(BEBarycentric);
955
956 // calculate per sample positions
957 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
958 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
959
960 simdmask coverageMask = work.coverageMask[sample] & MASK;
961 simdscalar vCoverageMask = vMask(coverageMask);
962
963 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
964
965 // interpolate and quantize z
966 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
967 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
968
969 RDTSC_STOP(BEBarycentric, 0, 0);
970
971 // interpolate user clip distance if available
972 if (rastState.clipDistanceMask)
973 {
974 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
975 psContext.vI.sample, psContext.vJ.sample);
976 }
977
978 simdscalar depthPassMask = vCoverageMask;
979 simdscalar stencilPassMask = vCoverageMask;
980
981 // offset depth/stencil buffers current sample
982 uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
983 uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
984
985 // Early-Z?
986 if (CanEarlyZ(pPSState))
987 {
988 RDTSC_START(BEEarlyDepthTest);
989 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
990 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
991 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
992
993 // early-exit if no samples passed depth or earlyZ is forced on.
994 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
995 {
996 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
997 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
998
999 if (!_simd_movemask_ps(depthPassMask))
1000 {
1001 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1002 continue;
1003 }
1004 }
1005 }
1006
1007 psContext.sampleIndex = sample;
1008 psContext.activeMask = _simd_castps_si(vCoverageMask);
1009
1010 // execute pixel shader
1011 RDTSC_START(BEPixelShader);
1012 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
1013 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1014 RDTSC_STOP(BEPixelShader, 0, 0);
1015
1016 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
1017
1018 // late-Z
1019 if (!CanEarlyZ(pPSState))
1020 {
1021 RDTSC_START(BELateDepthTest);
1022 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1023 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1024 RDTSC_STOP(BELateDepthTest, 0, 0);
1025
1026 if (!_simd_movemask_ps(depthPassMask))
1027 {
1028 // need to call depth/stencil write for stencil write
1029 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1030 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1031
1032 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1033 continue;
1034 }
1035 }
1036
1037 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1038 uint32_t statCount = _mm_popcnt_u32(statMask);
1039 UPDATE_STAT(DepthPassCount, statCount);
1040
1041 // output merger
1042 RDTSC_START(BEOutputMerger);
1043 backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
1044 vCoverageMask, depthPassMask);
1045
1046 // do final depth write after all pixel kills
1047 if (!pPSState->forceEarlyZ)
1048 {
1049 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1050 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1051 }
1052 RDTSC_STOP(BEOutputMerger, 0, 0);
1053 }
1054 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1055 }
1056 RDTSC_START(BEEndTile);
1057 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1058 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1059
1060 for (uint32_t rt = 0; rt < NumRT; ++rt)
1061 {
1062 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1063 }
1064 RDTSC_STOP(BEEndTile, 0, 0);
1065 }
1066 }
1067 }
1068
1069 template<typename T>
1070 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1071 {
1072 RDTSC_START(BESetup);
1073
1074 SWR_CONTEXT *pContext = pDC->pContext;
1075 const API_STATE& state = GetApiState(pDC);
1076 const SWR_RASTSTATE& rastState = state.rastState;
1077 const SWR_PS_STATE *pPSState = &state.psState;
1078 const SWR_BLEND_STATE *pBlendState = &state.blendState;
1079 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1080
1081 // broadcast scalars
1082 BarycentricCoeffs coeffs;
1083 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1084 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1085 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1086
1087 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1088 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1089 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1090
1091 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1092 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1093 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1094
1095 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1096
1097 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
1098 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
1099 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
1100
1101 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
1102 uint32_t NumRT = state.psState.numRenderTargets;
1103 for(uint32_t rt = 0; rt < NumRT; ++rt)
1104 {
1105 pColorBase[rt] = renderBuffers.pColor[rt];
1106 }
1107 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1108 RDTSC_STOP(BESetup, 0, 0);
1109
1110 SWR_PS_CONTEXT psContext;
1111 psContext.pAttribs = work.pAttribs;
1112 psContext.pPerspAttribs = work.pPerspAttribs;
1113 psContext.frontFace = work.triFlags.frontFacing;
1114 psContext.primID = work.triFlags.primID;
1115 psContext.pRecipW = work.pRecipW;
1116 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
1117 psContext.I = work.I;
1118 psContext.J = work.J;
1119 psContext.recipDet = work.recipDet;
1120 psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
1121 psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
1122 psContext.sampleIndex = 0;
1123
1124 uint32_t numOMSamples;
1125 // RT has to be single sample if we're in forcedMSAA mode
1126 if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
1127 {
1128 numOMSamples = 1;
1129 }
1130 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
1131 else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
1132 {
1133 numOMSamples = GetNumSamples(pBlendState->sampleCount);
1134 }
1135 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
1136 else
1137 {
1138 numOMSamples = T::MultisampleT::numSamples;
1139 }
1140
1141 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1142 {
1143 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1144 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
1145 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1146 {
1147 simdscalar vZ[T::MultisampleT::numSamples]{ 0 };
1148 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1149 // set pixel center positions
1150 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
1151
1152 if (T::bInputCoverage)
1153 {
1154 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
1155 }
1156
1157 if(T::bCentroidPos)
1158 {
1159 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
1160 RDTSC_START(BEBarycentric);
1161 CalcCentroidBarycentrics<T>(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
1162 RDTSC_STOP(BEBarycentric, 0, 0);
1163 }
1164
1165 // if oDepth written to, or there is a potential to discard any samples, we need to
1166 // run the PS early, then interp or broadcast Z and test
1167 if(pPSState->writesODepth || pPSState->killsPixel)
1168 {
1169 RDTSC_START(BEBarycentric);
1170 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1171
1172 // interpolate and quantize z
1173 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1174 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1175 RDTSC_STOP(BEBarycentric, 0, 0);
1176
1177 // execute pixel shader
1178 RDTSC_START(BEPixelShader);
1179 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1180 RDTSC_STOP(BEPixelShader, 0, 0);
1181 }
1182 else
1183 {
1184 psContext.activeMask = _simd_set1_epi32(-1);
1185 }
1186
1187 // need to declare enough space for all samples
1188 simdscalar vCoverageMask[T::MultisampleT::numSamples];
1189 simdscalar depthPassMask[T::MultisampleT::numSamples];
1190 simdscalar stencilPassMask[T::MultisampleT::numSamples];
1191 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
1192 simdscalar anyStencilSamplePassed = _simd_setzero_ps();
1193 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1194 {
1195 vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
1196
1197 // pull mask back out for any discards and and with coverage
1198 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
1199
1200 if (!_simd_movemask_ps(vCoverageMask[sample]))
1201 {
1202 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
1203 continue;
1204 }
1205
1206 if(T::bForcedSampleCount)
1207 {
1208 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
1209 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
1210 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
1211 continue;
1212 }
1213
1214 depthPassMask[sample] = vCoverageMask[sample];
1215
1216 // if oDepth isn't written to, we need to interpolate Z for each sample
1217 // if clip distances are enabled, we need to interpolate for each sample
1218 if(!pPSState->writesODepth || rastState.clipDistanceMask)
1219 {
1220 RDTSC_START(BEBarycentric);
1221 if(T::bIsStandardPattern)
1222 {
1223 // calculate per sample positions
1224 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
1225 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
1226 }
1227 else
1228 {
1229 psContext.vX.sample = psContext.vX.center;
1230 psContext.vY.sample = psContext.vY.center;
1231 }
1232
1233 // calc I & J per sample
1234 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1235
1236 // interpolate and quantize z
1237 if (!pPSState->writesODepth)
1238 {
1239 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1240 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
1241 }
1242
1243 ///@todo: perspective correct vs non-perspective correct clipping?
1244 // interpolate clip distances
1245 if (rastState.clipDistanceMask)
1246 {
1247 uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1248 psContext.vI.sample, psContext.vJ.sample);
1249 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
1250 }
1251 RDTSC_STOP(BEBarycentric, 0, 0);
1252 }
1253 // else 'broadcast' and test psContext.vZ written from the PS each sample
1254 else
1255 {
1256 vZ[sample] = psContext.vZ;
1257 }
1258
1259 // offset depth/stencil buffers current sample
1260 uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
1261 uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
1262
1263 // ZTest for this sample
1264 RDTSC_START(BEEarlyDepthTest);
1265 stencilPassMask[sample] = vCoverageMask[sample];
1266 depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing,
1267 vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
1268 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1269
1270 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
1271 anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
1272 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
1273 uint32_t statCount = _mm_popcnt_u32(statMask);
1274 UPDATE_STAT(DepthPassCount, statCount);
1275 }
1276
1277 // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
1278 if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
1279 {
1280 RDTSC_START(BEBarycentric);
1281 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1282 // interpolate and quantize z
1283 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1284 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1285 RDTSC_STOP(BEBarycentric, 0, 0);
1286
1287 // execute pixel shader
1288 RDTSC_START(BEPixelShader);
1289 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1290 RDTSC_STOP(BEPixelShader, 0, 0);
1291 }
1292 ///@todo: make sure this works for kill pixel
1293 else if(!_simd_movemask_ps(anyStencilSamplePassed))
1294 {
1295 goto Endtile;
1296 }
1297
1298 // loop over all samples, broadcasting the results of the PS to all passing pixels
1299 for(uint32_t sample = 0; sample < numOMSamples; sample++)
1300 {
1301 uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
1302 uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
1303
1304 // output merger
1305 RDTSC_START(BEOutputMerger);
1306
1307 // skip if none of the pixels for this sample passed
1308 simdscalar coverageMaskSample;
1309 simdscalar depthMaskSample;
1310 simdscalar stencilMaskSample;
1311 simdscalar vInterpolatedZ;
1312
1313 // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
1314 // depth test is disabled, so just set the z val to 0.
1315 if(T::bForcedSampleCount)
1316 {
1317 coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
1318 vInterpolatedZ = _simd_setzero_ps();
1319 }
1320 else if(T::bIsStandardPattern)
1321 {
1322 if(!_simd_movemask_ps(depthPassMask[sample]))
1323 {
1324 depthPassMask[sample] = _simd_setzero_ps();
1325 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample],
1326 vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
1327 continue;
1328 }
1329 coverageMaskSample = vCoverageMask[sample];
1330 depthMaskSample = depthPassMask[sample];
1331 stencilMaskSample = stencilPassMask[sample];
1332 vInterpolatedZ = vZ[sample];
1333 }
1334 else
1335 {
1336 // center pattern only needs to use a single depth test as all samples are at the same position
1337 if(!_simd_movemask_ps(depthPassMask[0]))
1338 {
1339 depthPassMask[0] = _simd_setzero_ps();
1340 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0],
1341 vCoverageMask[0], pStencilSample, stencilPassMask[0]);
1342 continue;
1343 }
1344 coverageMaskSample = (vCoverageMask[0]);
1345 depthMaskSample = depthPassMask[0];
1346 stencilMaskSample = stencilPassMask[0];
1347 vInterpolatedZ = vZ[0];
1348 }
1349
1350 // output merger
1351 RDTSC_START(BEOutputMerger);
1352 backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
1353 coverageMaskSample, depthMaskSample);
1354
1355 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample,
1356 coverageMaskSample, pStencilSample, stencilMaskSample);
1357 RDTSC_STOP(BEOutputMerger, 0, 0);
1358 }
1359
1360 Endtile:
1361 RDTSC_START(BEEndTile);
1362 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1363 {
1364 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1365 }
1366
1367 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1368 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1369
1370 for(uint32_t rt = 0; rt < NumRT; ++rt)
1371 {
1372 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1373 }
1374 RDTSC_STOP(BEEndTile, 0, 0);
1375 }
1376 }
1377 }
1378 // optimized backend flow with NULL PS
1379 template<uint32_t sampleCountT>
1380 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1381 {
1382 ///@todo: handle center multisample pattern
1383 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
1384 RDTSC_START(BESetup);
1385
1386 SWR_CONTEXT *pContext = pDC->pContext;
1387 const API_STATE& state = GetApiState(pDC);
1388 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1389 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1390
1391 // broadcast scalars
1392 BarycentricCoeffs coeffs;
1393 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1394 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1395 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1396
1397 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1398 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1399 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1400
1401 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1402 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1403 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1404
1405 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1406
1407 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1408
1409 RDTSC_STOP(BESetup, 0, 0);
1410
1411 SWR_PS_CONTEXT psContext;
1412 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1413 {
1414 // UL pixel corner
1415 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1416
1417 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1418 {
1419 // UL pixel corners
1420 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1421
1422 // iterate over active samples
1423 unsigned long sample = 0;
1424 uint32_t sampleMask = state.blendState.sampleMask;
1425 while (_BitScanForward(&sample, sampleMask))
1426 {
1427 sampleMask &= ~(1 << sample);
1428 simdmask coverageMask = work.coverageMask[sample] & MASK;
1429 if (coverageMask)
1430 {
1431 RDTSC_START(BEBarycentric);
1432 // calculate per sample positions
1433 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
1434 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
1435
1436 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1437
1438 // interpolate and quantize z
1439 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1440 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1441
1442 RDTSC_STOP(BEBarycentric, 0, 0);
1443
1444 // interpolate user clip distance if available
1445 if (rastState.clipDistanceMask)
1446 {
1447 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1448 psContext.vI.sample, psContext.vJ.sample);
1449 }
1450
1451 simdscalar vCoverageMask = vMask(coverageMask);
1452 simdscalar stencilPassMask = vCoverageMask;
1453
1454 // offset depth/stencil buffers current sample
1455 uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
1456 uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
1457
1458 RDTSC_START(BEEarlyDepthTest);
1459 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1460 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1461 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1462 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1463 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1464
1465 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1466 uint32_t statCount = _mm_popcnt_u32(statMask);
1467 UPDATE_STAT(DepthPassCount, statCount);
1468 }
1469 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1470 }
1471 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1472 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1473 }
1474 }
1475 }
1476
1477 void InitClearTilesTable()
1478 {
1479 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1480
1481 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1482 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1483 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1484 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1485 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1486 }
1487
1488 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
1489 PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {};
1490 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {};
1491 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {};
1492 PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {};
1493 PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {};
1494 PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {};
1495
1496 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1497 // arguments to static template arguments.
1498 template <uint32_t... ArgsT>
1499 struct OMChooser
1500 {
1501 // Last Arg Terminator
1502 static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg)
1503 {
1504 switch(tArg)
1505 {
1506 case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break;
1507 case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break;
1508 case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break;
1509 case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break;
1510 case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break;
1511 default:
1512 SWR_ASSERT(0 && "Invalid sample count\n");
1513 return nullptr;
1514 break;
1515 }
1516 }
1517
1518 // Recursively parse args
1519 template <typename... TArgsT>
1520 static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1521 {
1522 switch(tArg)
1523 {
1524 case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break;
1525 case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break;
1526 case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break;
1527 case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break;
1528 case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break;
1529 case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break;
1530 case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break;
1531 case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break;
1532 case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break;
1533 default:
1534 SWR_ASSERT(0 && "Invalid RT index\n");
1535 return nullptr;
1536 break;
1537 }
1538 }
1539 };
1540
1541 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1542 // arguments to static template arguments.
1543 template <uint32_t... ArgsT>
1544 struct BEChooser
1545 {
1546 // Last Arg Terminator
1547 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1548 {
1549 switch(tArg)
1550 {
1551 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
1552 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
1553 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
1554 default:
1555 SWR_ASSERT(0 && "Invalid backend func\n");
1556 return nullptr;
1557 break;
1558 }
1559 }
1560
1561 // Recursively parse args
1562 template <typename... TArgsT>
1563 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
1564 {
1565 switch(tArg)
1566 {
1567 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
1568 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
1569 default:
1570 SWR_ASSERT(0 && "Invalid sample pattern\n");
1571 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
1572 break;
1573 }
1574 }
1575
1576 // Recursively parse args
1577 template <typename... TArgsT>
1578 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1579 {
1580 switch(tArg)
1581 {
1582 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1583 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1584 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1585 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1586 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1587 default:
1588 SWR_ASSERT(0 && "Invalid sample count\n");
1589 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
1590 break;
1591 }
1592 }
1593
1594 // Recursively parse args
1595 template <typename... TArgsT>
1596 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
1597 {
1598 if(tArg == true)
1599 {
1600 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1601 }
1602
1603 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1604 }
1605 };
1606
1607 template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates>
1608 void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates])
1609 {
1610 for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++)
1611 {
1612 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1613 {
1614 table[rtNum][sampleCount] =
1615 OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount);
1616 }
1617 }
1618 }
1619
1620 template <SWR_MULTISAMPLE_COUNT numSampleRates>
1621 void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2],
1622 PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2])
1623 {
1624 pixelTable[0] = CalcPixelBarycentrics<0>;
1625 pixelTable[1] = CalcPixelBarycentrics<1>;
1626
1627 sampleTable[0] = CalcSampleBarycentrics<0>;
1628 sampleTable[1] = CalcSampleBarycentrics<1>;
1629 }
1630
1631 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2])
1632 {
1633 gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1634 gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1635 gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1636 gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, true, false, false,(SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1637 }
1638
1639 template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
1640 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2])
1641 {
1642 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1643 {
1644 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++)
1645 {
1646 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
1647 {
1648 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1649 {
1650 table[sampleCount][samplePattern][inputCoverage][isCentroid][0] =
1651 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0),
1652 false, false, SWR_BACKEND_MSAA_PIXEL_RATE);
1653 table[sampleCount][samplePattern][inputCoverage][isCentroid][1] =
1654 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0),
1655 true, false, SWR_BACKEND_MSAA_PIXEL_RATE);
1656 }
1657 }
1658 }
1659 }
1660 }
1661
1662 template <uint32_t numSampleRates, uint32_t numCoverageModes>
1663 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2])
1664 {
1665 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1666 {
1667 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
1668 {
1669 table[sampleCount][inputCoverage][0] =
1670 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1671 table[sampleCount][inputCoverage][1] =
1672 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1673 }
1674 }
1675 }
1676
1677 void InitBackendFuncTables()
1678 {
1679 InitBackendSampleFuncTable(gBackendSingleSample);
1680 InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
1681 InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
1682 InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
1683 InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable);
1684
1685 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1686 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1687 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1688 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1689 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1690 }