Merge remote-tracking branch 'public/master' into vulkan
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "rdtsc_core.h"
33 #include "backend.h"
34 #include "depthstencil.h"
35 #include "tilemgr.h"
36 #include "memory/tilingtraits.h"
37 #include "core/multisample.h"
38
39 #include <algorithm>
40
41 const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5};
42 const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5};
43
44 /// @todo move to common lib
45 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
46 static const __m128 gMaskToVec[] = {
47 MASKTOVEC(0,0,0,0),
48 MASKTOVEC(0,0,0,1),
49 MASKTOVEC(0,0,1,0),
50 MASKTOVEC(0,0,1,1),
51 MASKTOVEC(0,1,0,0),
52 MASKTOVEC(0,1,0,1),
53 MASKTOVEC(0,1,1,0),
54 MASKTOVEC(0,1,1,1),
55 MASKTOVEC(1,0,0,0),
56 MASKTOVEC(1,0,0,1),
57 MASKTOVEC(1,0,1,0),
58 MASKTOVEC(1,0,1,1),
59 MASKTOVEC(1,1,0,0),
60 MASKTOVEC(1,1,0,1),
61 MASKTOVEC(1,1,1,0),
62 MASKTOVEC(1,1,1,1),
63 };
64
65 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
66 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
67
68 //////////////////////////////////////////////////////////////////////////
69 /// @brief Process compute work.
70 /// @param pDC - pointer to draw context (dispatch).
71 /// @param workerId - The unique worker ID that is assigned to this thread.
72 /// @param threadGroupId - the linear index for the thread group within the dispatch.
73 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
74 {
75 RDTSC_START(BEDispatch);
76
77 SWR_CONTEXT *pContext = pDC->pContext;
78
79 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
80 SWR_ASSERT(pTaskData != nullptr);
81
82 // Ensure spill fill memory has been allocated.
83 if (pSpillFillBuffer == nullptr)
84 {
85 ///@todo Add state which indicates the spill fill size.
86 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
87 }
88
89 const API_STATE& state = GetApiState(pDC);
90
91 SWR_CS_CONTEXT csContext{ 0 };
92 csContext.tileCounter = threadGroupId;
93 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
94 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
95 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
96 csContext.pTGSM = pContext->pScratch[workerId];
97 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
98
99 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
100
101 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
102
103 RDTSC_STOP(BEDispatch, 1, 0);
104 }
105
106 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
107 {
108 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
109
110 uint32_t x, y;
111 MacroTileMgr::getTileIndices(macroTile, x, y);
112 SWR_ASSERT(x == 0 && y == 0);
113
114 if (pSync->pfnCallbackFunc != nullptr)
115 {
116 pSync->pfnCallbackFunc(pSync->userData, pSync->userData2, pSync->userData3);
117 }
118 }
119
120 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
121 {
122 QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData;
123 SWR_STATS* pStats = pQueryDesc->pStats;
124 SWR_CONTEXT *pContext = pDC->pContext;
125
126 SWR_ASSERT(pStats != nullptr);
127
128 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
129 {
130 pStats->DepthPassCount += pContext->stats[i].DepthPassCount;
131
132 pStats->IaVertices += pContext->stats[i].IaVertices;
133 pStats->IaPrimitives += pContext->stats[i].IaPrimitives;
134 pStats->VsInvocations += pContext->stats[i].VsInvocations;
135 pStats->HsInvocations += pContext->stats[i].HsInvocations;
136 pStats->DsInvocations += pContext->stats[i].DsInvocations;
137 pStats->GsInvocations += pContext->stats[i].GsInvocations;
138 pStats->PsInvocations += pContext->stats[i].PsInvocations;
139 pStats->CInvocations += pContext->stats[i].CInvocations;
140 pStats->CsInvocations += pContext->stats[i].CsInvocations;
141 pStats->CPrimitives += pContext->stats[i].CPrimitives;
142 pStats->GsPrimitives += pContext->stats[i].GsPrimitives;
143
144 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
145 {
146 pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream];
147
148 /// @note client is required to provide valid write offset before every draw, so we clear
149 /// out the contents of the write offset when storing stats
150 pContext->stats[i].SoWriteOffset[stream] = 0;
151
152 pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream];
153 pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream];
154 }
155 }
156 }
157
158 template<SWR_FORMAT format>
159 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
160 {
161 auto lambda = [&](int comp)
162 {
163 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
164 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
165 };
166
167 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
168 for (uint32_t i = 0; i < numIter; ++i)
169 {
170 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
171 }
172 }
173
174 template<SWR_FORMAT format>
175 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4])
176 {
177 // convert clear color to hottile format
178 // clear color is in RGBA float/uint32
179 simdvector vClear;
180 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
181 {
182 simdscalar vComp;
183 vComp = _simd_load1_ps((const float*)&clear[comp]);
184 if (FormatTraits<format>::isNormalized(comp))
185 {
186 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
187 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
188 }
189 vComp = FormatTraits<format>::pack(comp, vComp);
190 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
191 }
192
193 uint32_t tileX, tileY;
194 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
195 const API_STATE& state = GetApiState(pDC);
196
197 int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY;
198 int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1;
199 int left = KNOB_MACROTILE_X_DIM_FIXED * tileX;
200 int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1;
201
202 // intersect with scissor
203 top = std::max(top, state.scissorInFixedPoint.top);
204 left = std::max(left, state.scissorInFixedPoint.left);
205 bottom = std::min(bottom, state.scissorInFixedPoint.bottom);
206 right = std::min(right, state.scissorInFixedPoint.right);
207
208 // translate to local hottile origin
209 top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
210 bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
211 left -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
212 right -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
213
214 // convert to raster tiles
215 top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
216 bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
217 left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
218 right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
219
220 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
221 // compute steps between raster tile samples / raster tiles / macro tile rows
222 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
223 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
224 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
225 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
226
227 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
228 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples;
229 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
230
231 // loop over all raster tiles in the current hot tile
232 for (int y = top; y <= bottom; ++y)
233 {
234 uint8_t* pRasterTile = pRasterTileRow;
235 for (int x = left; x <= right; ++x)
236 {
237 for( int sampleNum = 0; sampleNum < numSamples; sampleNum++)
238 {
239 ClearRasterTile<format>(pRasterTile, vClear);
240 pRasterTile += rasterTileSampleStep;
241 }
242 }
243 pRasterTileRow += macroTileRowStep;
244 }
245
246 pHotTile->state = HOTTILE_DIRTY;
247 }
248
249
250 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
251 {
252 if (KNOB_FAST_CLEAR)
253 {
254 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
255 SWR_CONTEXT *pContext = pDC->pContext;
256 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
257 uint32_t numSamples = GetNumSamples(sampleCount);
258
259 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
260
261 RDTSC_START(BEClear);
262
263 if (pClear->flags.mask & SWR_CLEAR_COLOR)
264 {
265 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
266 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
267 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
268 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
269 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
270 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
271 pHotTile->state = HOTTILE_CLEAR;
272 }
273
274 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
275 {
276 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
277 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
278 pHotTile->state = HOTTILE_CLEAR;
279 }
280
281 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
282 {
283 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
284
285 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
286 pHotTile->state = HOTTILE_CLEAR;
287 }
288
289 RDTSC_STOP(BEClear, 0, 0);
290 }
291 else
292 {
293 // Legacy clear
294 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
295 RDTSC_START(BEClear);
296
297 if (pClear->flags.mask & SWR_CLEAR_COLOR)
298 {
299 /// @todo clear data should come in as RGBA32_FLOAT
300 DWORD clearData[4];
301 float clearFloat[4];
302 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
303 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
304 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
305 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
306 clearData[0] = *(DWORD*)&clearFloat[0];
307 clearData[1] = *(DWORD*)&clearFloat[1];
308 clearData[2] = *(DWORD*)&clearFloat[2];
309 clearData[3] = *(DWORD*)&clearFloat[3];
310
311 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
312 SWR_ASSERT(pfnClearTiles != nullptr);
313
314 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData);
315 }
316
317 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
318 {
319 DWORD clearData[4];
320 clearData[0] = *(DWORD*)&pClear->clearDepth;
321 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
322 SWR_ASSERT(pfnClearTiles != nullptr);
323
324 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData);
325 }
326
327 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
328 {
329 uint32_t value = pClear->clearStencil;
330 DWORD clearData[4];
331 clearData[0] = *(DWORD*)&value;
332 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
333
334 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData);
335 }
336
337 RDTSC_STOP(BEClear, 0, 0);
338 }
339 }
340
341
342 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
343 {
344 RDTSC_START(BEStoreTiles);
345 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
346 SWR_CONTEXT *pContext = pDC->pContext;
347
348 #ifdef KNOB_ENABLE_RDTSC
349 uint32_t numTiles = 0;
350 #endif
351 SWR_FORMAT srcFormat;
352 switch (pDesc->attachment)
353 {
354 case SWR_ATTACHMENT_COLOR0:
355 case SWR_ATTACHMENT_COLOR1:
356 case SWR_ATTACHMENT_COLOR2:
357 case SWR_ATTACHMENT_COLOR3:
358 case SWR_ATTACHMENT_COLOR4:
359 case SWR_ATTACHMENT_COLOR5:
360 case SWR_ATTACHMENT_COLOR6:
361 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
362 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
363 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
364 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
365 }
366
367 uint32_t x, y;
368 MacroTileMgr::getTileIndices(macroTile, x, y);
369
370 // Only need to store the hottile if it's been rendered to...
371 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
372 if (pHotTile)
373 {
374 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
375 if (pHotTile->state == HOTTILE_CLEAR)
376 {
377 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
378 SWR_ASSERT(pfnClearTiles != nullptr);
379
380 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData);
381 }
382
383 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
384 {
385 int destX = KNOB_MACROTILE_X_DIM * x;
386 int destY = KNOB_MACROTILE_Y_DIM * y;
387
388 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
389 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
390 }
391
392
393 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
394 {
395 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
396 }
397 }
398 RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
399 }
400
401
402 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
403 {
404 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
405 SWR_CONTEXT *pContext = pDC->pContext;
406
407 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
408
409 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
410 {
411 if (pDesc->attachmentMask & (1 << i))
412 {
413 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
414 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
415 if (pHotTile)
416 {
417 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
418 }
419 }
420 }
421 }
422
423 #if KNOB_SIMD_WIDTH == 8
424 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
425 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
426 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
427 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
428 #else
429 #error Unsupported vector width
430 #endif
431
432 INLINE
433 bool CanEarlyZ(const SWR_PS_STATE *pPSState)
434 {
435 return (pPSState->forceEarlyZ || (!pPSState->writesODepth && !pPSState->usesSourceDepth && !pPSState->usesUAV));
436 }
437
438 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
439 {
440 simdscalar vClipMask = _simd_setzero_ps();
441 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
442
443 for (uint32_t i = 0; i < numClipDistance; ++i)
444 {
445 // pull triangle clip distance values from clip buffer
446 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
447 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
448 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
449
450 // interpolate
451 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
452
453 // clip if interpolated clip distance is < 0 || NAN
454 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
455
456 vClipMask = _simd_or_ps(vClipMask, vCull);
457 }
458
459 return _simd_movemask_ps(vClipMask);
460 }
461
462 template<bool perspMask>
463 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
464 {
465 if(perspMask)
466 {
467 // evaluate I,J
468 psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
469 psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
470 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
471 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
472
473 // interpolate 1/w
474 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
475 }
476 }
477
478 template<bool perspMask>
479 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
480 {
481 if(perspMask)
482 {
483 // evaluate I,J
484 psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
485 psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
486 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
487 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
488
489 // interpolate 1/w
490 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
491 }
492 }
493
494
495 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
496 // Centroid behaves exactly as follows :
497 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
498 // have a sample location there).
499 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
500 // coverage with the SampleMask Rasterizer State.
501 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
502 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
503 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
504 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
505 template<SWR_MULTISAMPLE_COUNT sampleCount, bool bForcedSampleCount>
506 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
507 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
508 {
509 uint32_t inputMask[KNOB_SIMD_WIDTH];
510
511 generateInputCoverage<sampleCount, 1, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
512
513 // Case (2) - partially covered pixel
514
515 // scan for first covered sample per pixel in the 4x2 span
516 unsigned long sampleNum[KNOB_SIMD_WIDTH];
517 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
518 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
519 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
520 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
521 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
522 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
523 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
524 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
525
526 // look up and set the sample offsets from UL pixel corner for first covered sample
527 __m256 vXSample = _mm256_set_ps(MultisampleTraits<sampleCount>::X(sampleNum[7]),
528 MultisampleTraits<sampleCount>::X(sampleNum[6]),
529 MultisampleTraits<sampleCount>::X(sampleNum[5]),
530 MultisampleTraits<sampleCount>::X(sampleNum[4]),
531 MultisampleTraits<sampleCount>::X(sampleNum[3]),
532 MultisampleTraits<sampleCount>::X(sampleNum[2]),
533 MultisampleTraits<sampleCount>::X(sampleNum[1]),
534 MultisampleTraits<sampleCount>::X(sampleNum[0]));
535
536 __m256 vYSample = _mm256_set_ps(MultisampleTraits<sampleCount>::Y(sampleNum[7]),
537 MultisampleTraits<sampleCount>::Y(sampleNum[6]),
538 MultisampleTraits<sampleCount>::Y(sampleNum[5]),
539 MultisampleTraits<sampleCount>::Y(sampleNum[4]),
540 MultisampleTraits<sampleCount>::Y(sampleNum[3]),
541 MultisampleTraits<sampleCount>::Y(sampleNum[2]),
542 MultisampleTraits<sampleCount>::Y(sampleNum[1]),
543 MultisampleTraits<sampleCount>::Y(sampleNum[0]));
544 // add sample offset to UL pixel corner
545 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
546 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
547
548 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
549 static const __m256i vFullyCoveredMask = MultisampleTraits<sampleCount>::FullSampleMask();
550 __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
551 __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
552
553 static const __m256i vZero = _simd_setzero_si();
554 const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
555 __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
556 __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
557 __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
558
559 __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
560
561 // set the centroid position based on results from above
562 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
563 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
564
565 // Case (3a) No samples covered and partial sample mask
566 __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
567 // sample mask should never be all 0's for this case, but handle it anyways
568 unsigned long firstCoveredSampleMaskSample = 0;
569 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
570
571 __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
572
573 vXSample = _simd_set1_ps(MultisampleTraits<sampleCount>::X(firstCoveredSampleMaskSample));
574 vYSample = _simd_set1_ps(MultisampleTraits<sampleCount>::Y(firstCoveredSampleMaskSample));
575
576 // blend in case 3a pixel locations
577 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
578 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
579 }
580
581 template<uint32_t sampleCount, uint32_t persp, uint32_t standardPattern, uint32_t forcedMultisampleCount>
582 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
583 const uint64_t *const coverageMask, const uint32_t sampleMask,
584 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
585 {
586 static const bool bPersp = (bool)persp;
587 static const bool bIsStandardPattern = (bool)standardPattern;
588 static const bool bForcedMultisampleCount = (bool)forcedMultisampleCount;
589
590 // calculate centroid positions
591 if(bPersp)
592 {
593 if(bIsStandardPattern)
594 {
595 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
596 CalcCentroidPos<(SWR_MULTISAMPLE_COUNT)sampleCount, bForcedMultisampleCount>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
597 }
598 else
599 {
600 static const __m256 pixelCenter = _simd_set1_ps(0.5f);
601 psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
602 psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
603 }
604 // evaluate I,J
605 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
606 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
607 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
608 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
609
610 // interpolate 1/w
611 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
612 }
613 }
614
615 template<uint32_t NumRT, uint32_t sampleCountT>
616 void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
617 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask)
618 {
619 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
620 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
621 uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
622 simdvector blendOut;
623
624 for(uint32_t rt = 0; rt < NumRT; ++rt)
625 {
626 uint8_t *pColorSample;
627 if(sampleCount == SWR_MULTISAMPLE_1X)
628 {
629 pColorSample = pColorBase[rt];
630 }
631 else
632 {
633 pColorSample = pColorBase[rt] + rasterTileColorOffset;
634 }
635
636 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
637 // pfnBlendFunc may not update all channels. Initialize with PS output.
638 /// TODO: move this into the blend JIT.
639 blendOut = psContext.shaded[rt];
640
641 // Blend outputs and update coverage mask for alpha test
642 if(pfnBlendFunc[rt] != nullptr)
643 {
644 pfnBlendFunc[rt](
645 pBlendState,
646 psContext.shaded[rt],
647 psContext.shaded[1],
648 sample,
649 pColorSample,
650 blendOut,
651 &psContext.oMask,
652 (simdscalari*)&coverageMask);
653 }
654
655 // final write mask
656 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
657
658 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
659 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
660
661 const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
662
663 // store with color mask
664 if(!pRTBlend->writeDisableRed)
665 {
666 _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
667 }
668 if(!pRTBlend->writeDisableGreen)
669 {
670 _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
671 }
672 if(!pRTBlend->writeDisableBlue)
673 {
674 _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
675 }
676 if(!pRTBlend->writeDisableAlpha)
677 {
678 _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
679 }
680 }
681 }
682
683 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
684 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
685 {
686 RDTSC_START(BESetup);
687 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
688 static const bool bInputCoverage = (bool)inputCoverage;
689 static const bool bCentroidPos = (bool)centroidPos;
690
691 SWR_CONTEXT *pContext = pDC->pContext;
692 const API_STATE& state = GetApiState(pDC);
693 const SWR_RASTSTATE& rastState = state.rastState;
694 const SWR_PS_STATE *pPSState = &state.psState;
695 const SWR_BLEND_STATE *pBlendState = &state.blendState;
696 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
697 uint64_t coverageMask = work.coverageMask[0];
698
699 // broadcast scalars
700 BarycentricCoeffs coeffs;
701 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
702 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
703 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
704
705 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
706 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
707 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
708
709 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
710 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
711 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
712
713 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
714
715 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
716 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
717 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
718
719 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
720 uint32_t NumRT = state.psState.numRenderTargets;
721 for(uint32_t rt = 0; rt < NumRT; ++rt)
722 {
723 pColorBase[rt] = renderBuffers.pColor[rt];
724 }
725 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
726 RDTSC_STOP(BESetup, 0, 0);
727
728 SWR_PS_CONTEXT psContext;
729 psContext.pAttribs = work.pAttribs;
730 psContext.pPerspAttribs = work.pPerspAttribs;
731 psContext.frontFace = work.triFlags.frontFacing;
732 psContext.primID = work.triFlags.primID;
733
734 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
735 psContext.I = work.I;
736 psContext.J = work.J;
737 psContext.recipDet = work.recipDet;
738 psContext.pRecipW = work.pRecipW;
739 psContext.pSamplePosX = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX;
740 psContext.pSamplePosY = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY;
741
742 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
743 {
744 // UL pixel corner
745 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
746 // pixel center
747 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
748
749 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
750 {
751 if(bInputCoverage)
752 {
753 generateInputCoverage<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
754 }
755
756 if(coverageMask & MASK)
757 {
758 RDTSC_START(BEBarycentric);
759 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
760 // pixel center
761 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
762
763 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
764
765 if(bCentroidPos)
766 {
767 // for 1x case, centroid is pixel center
768 psContext.vX.centroid = psContext.vX.center;
769 psContext.vY.centroid = psContext.vY.center;
770 psContext.vI.centroid = psContext.vI.center;
771 psContext.vJ.centroid = psContext.vJ.center;
772 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
773 }
774
775 // interpolate and quantize z
776 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
777 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
778
779 RDTSC_STOP(BEBarycentric, 0, 0);
780
781 simdmask clipCoverageMask = coverageMask & MASK;
782
783 // interpolate user clip distance if available
784 if(rastState.clipDistanceMask)
785 {
786 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
787 psContext.vI.center, psContext.vJ.center);
788 }
789
790 simdscalar vCoverageMask = vMask(clipCoverageMask);
791 simdscalar depthPassMask = vCoverageMask;
792 simdscalar stencilPassMask = vCoverageMask;
793
794 // Early-Z?
795 if(CanEarlyZ(pPSState))
796 {
797 RDTSC_START(BEEarlyDepthTest);
798 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
799 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
800 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
801
802 // early-exit if no pixels passed depth or earlyZ is forced on
803 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
804 {
805 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
806 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
807
808 if (!_simd_movemask_ps(depthPassMask))
809 {
810 goto Endtile;
811 }
812 }
813 }
814
815 psContext.sampleIndex = 0;
816 psContext.activeMask = _simd_castps_si(vCoverageMask);
817
818 // execute pixel shader
819 RDTSC_START(BEPixelShader);
820 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
821 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
822 RDTSC_STOP(BEPixelShader, 0, 0);
823
824 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
825
826 // late-Z
827 if(!CanEarlyZ(pPSState))
828 {
829 RDTSC_START(BELateDepthTest);
830 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
831 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
832 RDTSC_STOP(BELateDepthTest, 0, 0);
833
834 if(!_simd_movemask_ps(depthPassMask))
835 {
836 // need to call depth/stencil write for stencil write
837 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
838 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
839 goto Endtile;
840 }
841 }
842
843 uint32_t statMask = _simd_movemask_ps(depthPassMask);
844 uint32_t statCount = _mm_popcnt_u32(statMask);
845 UPDATE_STAT(DepthPassCount, statCount);
846
847 // output merger
848 RDTSC_START(BEOutputMerger);
849 backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc,
850 vCoverageMask, depthPassMask);
851
852 // do final depth write after all pixel kills
853 if (!pPSState->forceEarlyZ)
854 {
855 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
856 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
857 }
858 RDTSC_STOP(BEOutputMerger, 0, 0);
859 }
860
861 Endtile:
862 RDTSC_START(BEEndTile);
863 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
864 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
865 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
866
867 for(uint32_t rt = 0; rt < NumRT; ++rt)
868 {
869 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
870 }
871 RDTSC_STOP(BEEndTile, 0, 0);
872 }
873 }
874 }
875
876 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
877 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
878 {
879 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
880 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
881 static const bool bInputCoverage = (bool)inputCoverage;
882 static const bool bCentroidPos = (bool)centroidPos;
883
884 RDTSC_START(BESetup);
885
886 SWR_CONTEXT *pContext = pDC->pContext;
887 const API_STATE& state = GetApiState(pDC);
888 const SWR_RASTSTATE& rastState = state.rastState;
889 const SWR_PS_STATE *pPSState = &state.psState;
890 const SWR_BLEND_STATE *pBlendState = &state.blendState;
891 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
892
893 // broadcast scalars
894 BarycentricCoeffs coeffs;
895 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
896 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
897 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
898
899 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
900 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
901 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
902
903 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
904 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
905 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
906
907 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
908
909 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
910 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
911 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
912
913 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
914 uint32_t NumRT = state.psState.numRenderTargets;
915 for(uint32_t rt = 0; rt < NumRT; ++rt)
916 {
917 pColorBase[rt] = renderBuffers.pColor[rt];
918 }
919 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
920 RDTSC_STOP(BESetup, 0, 0);
921
922 SWR_PS_CONTEXT psContext;
923 psContext.pAttribs = work.pAttribs;
924 psContext.pPerspAttribs = work.pPerspAttribs;
925 psContext.pRecipW = work.pRecipW;
926 psContext.frontFace = work.triFlags.frontFacing;
927 psContext.primID = work.triFlags.primID;
928
929 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
930 psContext.I = work.I;
931 psContext.J = work.J;
932 psContext.recipDet = work.recipDet;
933 psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
934 psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
935 const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples;
936
937 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
938 {
939 // UL pixel corner
940 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
941 // pixel center
942 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
943
944 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
945 {
946 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
947 // pixel center
948 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
949
950 RDTSC_START(BEBarycentric);
951 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
952 RDTSC_STOP(BEBarycentric, 0, 0);
953
954 if(bInputCoverage)
955 {
956 generateInputCoverage<sampleCount, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
957 }
958
959 if(bCentroidPos)
960 {
961 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
962 RDTSC_START(BEBarycentric);
963 backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
964 RDTSC_STOP(BEBarycentric, 0, 0);
965 }
966
967 for(uint32_t sample = 0; sample < numSamples; sample++)
968 {
969 if (work.coverageMask[sample] & MASK)
970 {
971 RDTSC_START(BEBarycentric);
972
973 // calculate per sample positions
974 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
975 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
976
977 simdmask coverageMask = work.coverageMask[sample] & MASK;
978 simdscalar vCoverageMask = vMask(coverageMask);
979
980 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
981
982 // interpolate and quantize z
983 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
984 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
985
986 RDTSC_STOP(BEBarycentric, 0, 0);
987
988 // interpolate user clip distance if available
989 if (rastState.clipDistanceMask)
990 {
991 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
992 psContext.vI.sample, psContext.vJ.sample);
993 }
994
995 simdscalar depthPassMask = vCoverageMask;
996 simdscalar stencilPassMask = vCoverageMask;
997
998 // offset depth/stencil buffers current sample
999 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1000 uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1001
1002 // Early-Z?
1003 if (CanEarlyZ(pPSState))
1004 {
1005 RDTSC_START(BEEarlyDepthTest);
1006 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1007 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1008 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1009
1010 // early-exit if no samples passed depth or earlyZ is forced on.
1011 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
1012 {
1013 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1014 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1015
1016 if (!_simd_movemask_ps(depthPassMask))
1017 {
1018 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1019 continue;
1020 }
1021 }
1022 }
1023
1024 psContext.sampleIndex = sample;
1025 psContext.activeMask = _simd_castps_si(vCoverageMask);
1026
1027 // execute pixel shader
1028 RDTSC_START(BEPixelShader);
1029 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
1030 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1031 RDTSC_STOP(BEPixelShader, 0, 0);
1032
1033 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
1034
1035 //// late-Z
1036 if (!CanEarlyZ(pPSState))
1037 {
1038 RDTSC_START(BELateDepthTest);
1039 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1040 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1041 RDTSC_STOP(BELateDepthTest, 0, 0);
1042
1043 if (!_simd_movemask_ps(depthPassMask))
1044 {
1045 // need to call depth/stencil write for stencil write
1046 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1047 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1048
1049 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1050 continue;
1051 }
1052 }
1053
1054 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1055 uint32_t statCount = _mm_popcnt_u32(statMask);
1056 UPDATE_STAT(DepthPassCount, statCount);
1057
1058 // output merger
1059 RDTSC_START(BEOutputMerger);
1060 backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
1061 vCoverageMask, depthPassMask);
1062
1063 // do final depth write after all pixel kills
1064 if (!pPSState->forceEarlyZ)
1065 {
1066 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1067 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1068 }
1069 RDTSC_STOP(BEOutputMerger, 0, 0);
1070 }
1071 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1072 }
1073 RDTSC_START(BEEndTile);
1074 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1075 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1076
1077 for (uint32_t rt = 0; rt < NumRT; ++rt)
1078 {
1079 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1080 }
1081 RDTSC_STOP(BEEndTile, 0, 0);
1082 }
1083 }
1084 }
1085
1086 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
1087 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1088 {
1089 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
1090 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
1091 static const bool bIsStandardPattern = (bool)samplePattern;
1092 static const bool bInputCoverage = (bool)inputCoverage;
1093 static const bool bCentroidPos = (bool)centroidPos;
1094 static const bool bForcedSampleCount = (bool)forcedSampleCount;
1095
1096 RDTSC_START(BESetup);
1097
1098 SWR_CONTEXT *pContext = pDC->pContext;
1099 const API_STATE& state = GetApiState(pDC);
1100 const SWR_RASTSTATE& rastState = state.rastState;
1101 const SWR_PS_STATE *pPSState = &state.psState;
1102 const SWR_BLEND_STATE *pBlendState = &state.blendState;
1103 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1104
1105 // broadcast scalars
1106 BarycentricCoeffs coeffs;
1107 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1108 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1109 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1110
1111 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1112 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1113 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1114
1115 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1116 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1117 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1118
1119 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1120
1121 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
1122 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
1123 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
1124
1125 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
1126 uint32_t NumRT = state.psState.numRenderTargets;
1127 for(uint32_t rt = 0; rt < NumRT; ++rt)
1128 {
1129 pColorBase[rt] = renderBuffers.pColor[rt];
1130 }
1131 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1132 RDTSC_STOP(BESetup, 0, 0);
1133
1134 SWR_PS_CONTEXT psContext;
1135 psContext.pAttribs = work.pAttribs;
1136 psContext.pPerspAttribs = work.pPerspAttribs;
1137 psContext.frontFace = work.triFlags.frontFacing;
1138 psContext.primID = work.triFlags.primID;
1139 psContext.pRecipW = work.pRecipW;
1140 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
1141 psContext.I = work.I;
1142 psContext.J = work.J;
1143 psContext.recipDet = work.recipDet;
1144 psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
1145 psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
1146 psContext.sampleIndex = 0;
1147
1148 uint32_t numCoverageSamples;
1149 if(bIsStandardPattern)
1150 {
1151 numCoverageSamples = MultisampleTraits<sampleCount>::numSamples;
1152 }
1153 else
1154 {
1155 numCoverageSamples = 1;
1156 }
1157
1158 uint32_t numOMSamples;
1159 // RT has to be single sample if we're in forcedMSAA mode
1160 if(bForcedSampleCount && (sampleCount > SWR_MULTISAMPLE_1X))
1161 {
1162 numOMSamples = 1;
1163 }
1164 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
1165 else if(bForcedSampleCount && (sampleCount == SWR_MULTISAMPLE_1X))
1166 {
1167 numOMSamples = GetNumSamples(pBlendState->sampleCount);
1168 }
1169 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
1170 else
1171 {
1172 numOMSamples = MultisampleTraits<sampleCount>::numSamples;
1173 }
1174
1175 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1176 {
1177 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1178 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
1179 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1180 {
1181 simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 };
1182 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1183 // set pixel center positions
1184 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
1185
1186 if (bInputCoverage)
1187 {
1188 generateInputCoverage<sampleCount, bIsStandardPattern, bForcedSampleCount>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
1189 }
1190
1191 if(bCentroidPos)
1192 {
1193 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
1194 RDTSC_START(BEBarycentric);
1195 backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
1196 RDTSC_STOP(BEBarycentric, 0, 0);
1197 }
1198
1199 // if oDepth written to, or there is a potential to discard any samples, we need to
1200 // run the PS early, then interp or broadcast Z and test
1201 if(pPSState->writesODepth || pPSState->killsPixel)
1202 {
1203 RDTSC_START(BEBarycentric);
1204 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1205
1206 // interpolate and quantize z
1207 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1208 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1209 RDTSC_STOP(BEBarycentric, 0, 0);
1210
1211 // execute pixel shader
1212 RDTSC_START(BEPixelShader);
1213 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1214 RDTSC_STOP(BEPixelShader, 0, 0);
1215 }
1216 else
1217 {
1218 psContext.activeMask = _simd_set1_epi32(-1);
1219 }
1220
1221 // need to declare enough space for all samples
1222 simdscalar vCoverageMask[MultisampleTraits<sampleCount>::numSamples];
1223 simdscalar depthPassMask[MultisampleTraits<sampleCount>::numSamples];
1224 simdscalar stencilPassMask[MultisampleTraits<sampleCount>::numSamples];
1225 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
1226 simdscalar anyStencilSamplePassed = _simd_setzero_ps();
1227 for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
1228 {
1229 vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
1230
1231 // pull mask back out for any discards and and with coverage
1232 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
1233
1234 if (!_simd_movemask_ps(vCoverageMask[sample]))
1235 {
1236 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
1237 continue;
1238 }
1239
1240 if(bForcedSampleCount)
1241 {
1242 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
1243 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
1244 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
1245 continue;
1246 }
1247
1248 depthPassMask[sample] = vCoverageMask[sample];
1249
1250 // if oDepth isn't written to, we need to interpolate Z for each sample
1251 // if clip distances are enabled, we need to interpolate for each sample
1252 if(!pPSState->writesODepth || rastState.clipDistanceMask)
1253 {
1254 RDTSC_START(BEBarycentric);
1255 if(bIsStandardPattern)
1256 {
1257 // calculate per sample positions
1258 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
1259 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
1260 }
1261 else
1262 {
1263 psContext.vX.sample = psContext.vX.center;
1264 psContext.vY.sample = psContext.vY.center;
1265 }
1266
1267 // calc I & J per sample
1268 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1269
1270 // interpolate and quantize z
1271 if (!pPSState->writesODepth)
1272 {
1273 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1274 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
1275 }
1276
1277 ///@todo: perspective correct vs non-perspective correct clipping?
1278 // interpolate clip distances
1279 if (rastState.clipDistanceMask)
1280 {
1281 uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1282 psContext.vI.sample, psContext.vJ.sample);
1283 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
1284 }
1285 RDTSC_STOP(BEBarycentric, 0, 0);
1286 }
1287 // else 'broadcast' and test psContext.vZ written from the PS each sample
1288 else
1289 {
1290 vZ[sample] = psContext.vZ;
1291 }
1292
1293 // offset depth/stencil buffers current sample
1294 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1295 uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1296
1297 // ZTest for this sample
1298 RDTSC_START(BEEarlyDepthTest);
1299 stencilPassMask[sample] = vCoverageMask[sample];
1300 depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing,
1301 vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
1302 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1303
1304 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
1305 anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
1306 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
1307 uint32_t statCount = _mm_popcnt_u32(statMask);
1308 UPDATE_STAT(DepthPassCount, statCount);
1309 }
1310
1311 // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
1312 if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
1313 {
1314 RDTSC_START(BEBarycentric);
1315 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1316 // interpolate and quantize z
1317 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1318 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1319 RDTSC_STOP(BEBarycentric, 0, 0);
1320
1321 // execute pixel shader
1322 RDTSC_START(BEPixelShader);
1323 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1324 RDTSC_STOP(BEPixelShader, 0, 0);
1325 }
1326 ///@todo: make sure this works for kill pixel
1327 else if(!_simd_movemask_ps(anyStencilSamplePassed))
1328 {
1329 goto Endtile;
1330 }
1331
1332 // loop over all samples, broadcasting the results of the PS to all passing pixels
1333 for(uint32_t sample = 0; sample < numOMSamples; sample++)
1334 {
1335 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1336 uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1337
1338 // output merger
1339 RDTSC_START(BEOutputMerger);
1340
1341 // skip if none of the pixels for this sample passed
1342 simdscalar coverageMaskSample;
1343 simdscalar depthMaskSample;
1344 simdscalar stencilMaskSample;
1345 simdscalar vInterpolatedZ;
1346
1347 // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
1348 // depth test is disabled, so just set the z val to 0.
1349 if(bForcedSampleCount)
1350 {
1351 coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
1352 vInterpolatedZ = _simd_setzero_ps();
1353 }
1354 else if(bIsStandardPattern)
1355 {
1356 if(!_simd_movemask_ps(depthPassMask[sample]))
1357 {
1358 depthPassMask[sample] = _simd_setzero_ps();
1359 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample],
1360 vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
1361 continue;
1362 }
1363 coverageMaskSample = vCoverageMask[sample];
1364 depthMaskSample = depthPassMask[sample];
1365 stencilMaskSample = stencilPassMask[sample];
1366 vInterpolatedZ = vZ[sample];
1367 }
1368 else
1369 {
1370 // center pattern only needs to use a single depth test as all samples are at the same position
1371 if(!_simd_movemask_ps(depthPassMask[0]))
1372 {
1373 depthPassMask[0] = _simd_setzero_ps();
1374 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0],
1375 vCoverageMask[0], pStencilSample, stencilPassMask[0]);
1376 continue;
1377 }
1378 coverageMaskSample = (vCoverageMask[0]);
1379 depthMaskSample = depthPassMask[0];
1380 stencilMaskSample = stencilPassMask[0];
1381 vInterpolatedZ = vZ[0];
1382 }
1383
1384 // output merger
1385 RDTSC_START(BEOutputMerger);
1386 backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
1387 coverageMaskSample, depthMaskSample);
1388
1389 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample,
1390 coverageMaskSample, pStencilSample, stencilMaskSample);
1391 RDTSC_STOP(BEOutputMerger, 0, 0);
1392 }
1393
1394 Endtile:
1395 RDTSC_START(BEEndTile);
1396 for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
1397 {
1398 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1399 }
1400
1401 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1402 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1403
1404 for(uint32_t rt = 0; rt < NumRT; ++rt)
1405 {
1406 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1407 }
1408 RDTSC_STOP(BEEndTile, 0, 0);
1409 }
1410 }
1411 }
1412 // optimized backend flow with NULL PS
1413 template<uint32_t sampleCountT>
1414 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1415 {
1416 RDTSC_START(BESetup);
1417
1418 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
1419
1420 SWR_CONTEXT *pContext = pDC->pContext;
1421 const API_STATE& state = GetApiState(pDC);
1422 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1423 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1424
1425 // broadcast scalars
1426 BarycentricCoeffs coeffs;
1427 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1428 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1429 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1430
1431 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1432 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1433 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1434
1435 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1436 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1437 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1438
1439 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1440
1441 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1442
1443 RDTSC_STOP(BESetup, 0, 0);
1444
1445 SWR_PS_CONTEXT psContext;
1446 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1447 {
1448 // UL pixel corner
1449 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1450
1451 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1452 {
1453 // UL pixel corners
1454 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1455
1456 // iterate over active samples
1457 unsigned long sample = 0;
1458 uint32_t sampleMask = state.blendState.sampleMask;
1459 while (_BitScanForward(&sample, sampleMask))
1460 {
1461 sampleMask &= ~(1 << sample);
1462 simdmask coverageMask = work.coverageMask[sample] & MASK;
1463 if (coverageMask)
1464 {
1465 RDTSC_START(BEBarycentric);
1466 // calculate per sample positions
1467 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample));
1468 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample));
1469
1470 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1471
1472 // interpolate and quantize z
1473 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1474 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1475
1476 RDTSC_STOP(BEBarycentric, 0, 0);
1477
1478 // interpolate user clip distance if available
1479 if (rastState.clipDistanceMask)
1480 {
1481 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1482 psContext.vI.sample, psContext.vJ.sample);
1483 }
1484
1485 simdscalar vCoverageMask = vMask(coverageMask);
1486 simdscalar stencilPassMask = vCoverageMask;
1487
1488 // offset depth/stencil buffers current sample
1489 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1490 uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1491
1492 RDTSC_START(BEEarlyDepthTest);
1493 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1494 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1495 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1496 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1497 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1498
1499 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1500 uint32_t statCount = _mm_popcnt_u32(statMask);
1501 UPDATE_STAT(DepthPassCount, statCount);
1502 }
1503 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1504 }
1505 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1506 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1507 }
1508 }
1509 }
1510
1511 void InitClearTilesTable()
1512 {
1513 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1514
1515 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1516 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1517 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1518 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1519 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1520 }
1521
1522 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
1523 PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {};
1524 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {};
1525 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {};
1526 PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {};
1527 PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {};
1528 PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {};
1529 PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2] = {};
1530
1531 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1532 // arguments to static template arguments.
1533 template <uint32_t... ArgsT>
1534 struct OMChooser
1535 {
1536 // Last Arg Terminator
1537 static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg)
1538 {
1539 switch(tArg)
1540 {
1541 case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break;
1542 case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break;
1543 case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break;
1544 case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break;
1545 case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break;
1546 default:
1547 SWR_ASSERT(0 && "Invalid sample count\n");
1548 return nullptr;
1549 break;
1550 }
1551 }
1552
1553 // Recursively parse args
1554 template <typename... TArgsT>
1555 static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1556 {
1557 switch(tArg)
1558 {
1559 case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break;
1560 case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break;
1561 case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break;
1562 case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break;
1563 case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break;
1564 case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break;
1565 case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break;
1566 case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break;
1567 case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break;
1568 default:
1569 SWR_ASSERT(0 && "Invalid RT index\n");
1570 return nullptr;
1571 break;
1572 }
1573 }
1574 };
1575
1576 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1577 // arguments to static template arguments.
1578 template <uint32_t... ArgsT>
1579 struct BECentroidBarycentricChooser
1580 {
1581
1582 // Last Arg Terminator
1583 template <typename... TArgsT>
1584 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg)
1585 {
1586 if(tArg > 0)
1587 {
1588 return CalcCentroidBarycentrics<ArgsT..., 1>;
1589 }
1590
1591 return CalcCentroidBarycentrics<ArgsT..., 0>;
1592 }
1593
1594 // Recursively parse args
1595 template <typename... TArgsT>
1596 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1597 {
1598 switch(tArg)
1599 {
1600 case SWR_MULTISAMPLE_1X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1601 case SWR_MULTISAMPLE_2X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1602 case SWR_MULTISAMPLE_4X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1603 case SWR_MULTISAMPLE_8X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1604 case SWR_MULTISAMPLE_16X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1605 default:
1606 SWR_ASSERT(0 && "Invalid sample count\n");
1607 return nullptr;
1608 break;
1609 }
1610 }
1611
1612 // Recursively parse args
1613 template <typename... TArgsT>
1614 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1615 {
1616 if(tArg > 0)
1617 {
1618 return BECentroidBarycentricChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1619 }
1620
1621 return BECentroidBarycentricChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1622 }
1623 };
1624
1625 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1626 // arguments to static template arguments.
1627 template <uint32_t... ArgsT>
1628 struct BEChooser
1629 {
1630 // Last Arg Terminator
1631 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1632 {
1633 switch(tArg)
1634 {
1635 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<ArgsT...>; break;
1636 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<ArgsT...>; break;
1637 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<ArgsT...>; break;
1638 default:
1639 SWR_ASSERT(0 && "Invalid backend func\n");
1640 return nullptr;
1641 break;
1642 }
1643 }
1644
1645
1646 // Recursively parse args
1647 template <typename... TArgsT>
1648 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1649 {
1650 switch(tArg)
1651 {
1652 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1653 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1654 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1655 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1656 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1657 default:
1658 SWR_ASSERT(0 && "Invalid sample count\n");
1659 return nullptr;
1660 break;
1661 }
1662 }
1663
1664 // Recursively parse args
1665 template <typename... TArgsT>
1666 static PFN_BACKEND_FUNC GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1667 {
1668 if(tArg > 0)
1669 {
1670 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1671 }
1672
1673 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1674 }
1675 };
1676
1677 template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates>
1678 void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates])
1679 {
1680 for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++)
1681 {
1682 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1683 {
1684 table[rtNum][sampleCount] =
1685 OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount);
1686 }
1687 }
1688 }
1689
1690 template <SWR_MULTISAMPLE_COUNT numSampleRates>
1691 void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2],
1692 PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2],
1693 PFN_CALC_CENTROID_BARYCENTRICS (&centroidTable)[numSampleRates][2][2][2])
1694 {
1695 pixelTable[0] = CalcPixelBarycentrics<0>;
1696 pixelTable[1] = CalcPixelBarycentrics<1>;
1697
1698 sampleTable[0] = CalcSampleBarycentrics<0>;
1699 sampleTable[1] = CalcSampleBarycentrics<1>;
1700
1701 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1702 {
1703 for(uint32_t baryMask = 0; baryMask < 2; baryMask++)
1704 {
1705 for(uint32_t patternNum = 0; patternNum < 2; patternNum++)
1706 {
1707 for(uint32_t forcedSampleEnable = 0; forcedSampleEnable < 2; forcedSampleEnable++)
1708 {
1709 centroidTable[sampleCount][baryMask][patternNum][forcedSampleEnable]=
1710 BECentroidBarycentricChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, baryMask, patternNum, forcedSampleEnable);
1711 }
1712 }
1713 }
1714 }
1715 }
1716
1717 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2])
1718 {
1719 gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1720 gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1721 gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1722 gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1723 }
1724
1725 template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
1726 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2])
1727 {
1728 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1729 {
1730 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++)
1731 {
1732 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
1733 {
1734 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1735 {
1736 table[sampleCount][samplePattern][inputCoverage][isCentroid][0] =
1737 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
1738 table[sampleCount][samplePattern][inputCoverage][isCentroid][1] =
1739 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 1, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
1740 }
1741 }
1742 }
1743 }
1744 }
1745
1746 template <uint32_t numSampleRates, uint32_t numCoverageModes>
1747 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2])
1748 {
1749 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1750 {
1751 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
1752 {
1753 table[sampleCount][inputCoverage][0] =
1754 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1755 table[sampleCount][inputCoverage][1] =
1756 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1757 }
1758 }
1759 }
1760
1761 void InitBackendFuncTables()
1762 {
1763 InitBackendSampleFuncTable(gBackendSingleSample);
1764 InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
1765 InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
1766 InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
1767 InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable, gCentroidBarycentricTable);
1768
1769 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1770 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1771 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1772 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1773 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1774 }