swr: [rasterizer core] Use CS spill/fill size in core
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "rdtsc_core.h"
33 #include "backend.h"
34 #include "depthstencil.h"
35 #include "tilemgr.h"
36 #include "memory/tilingtraits.h"
37 #include "core/multisample.h"
38
39 #include <algorithm>
40
41 const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5};
42 const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5};
43
44 /// @todo move to common lib
45 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
46 static const __m128 gMaskToVec[] = {
47 MASKTOVEC(0,0,0,0),
48 MASKTOVEC(0,0,0,1),
49 MASKTOVEC(0,0,1,0),
50 MASKTOVEC(0,0,1,1),
51 MASKTOVEC(0,1,0,0),
52 MASKTOVEC(0,1,0,1),
53 MASKTOVEC(0,1,1,0),
54 MASKTOVEC(0,1,1,1),
55 MASKTOVEC(1,0,0,0),
56 MASKTOVEC(1,0,0,1),
57 MASKTOVEC(1,0,1,0),
58 MASKTOVEC(1,0,1,1),
59 MASKTOVEC(1,1,0,0),
60 MASKTOVEC(1,1,0,1),
61 MASKTOVEC(1,1,1,0),
62 MASKTOVEC(1,1,1,1),
63 };
64
65 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
66 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
67
68 //////////////////////////////////////////////////////////////////////////
69 /// @brief Process compute work.
70 /// @param pDC - pointer to draw context (dispatch).
71 /// @param workerId - The unique worker ID that is assigned to this thread.
72 /// @param threadGroupId - the linear index for the thread group within the dispatch.
73 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
74 {
75 RDTSC_START(BEDispatch);
76
77 SWR_CONTEXT *pContext = pDC->pContext;
78
79 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
80 SWR_ASSERT(pTaskData != nullptr);
81
82 // Ensure spill fill memory has been allocated.
83 if (pSpillFillBuffer == nullptr)
84 {
85 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(pDC->pState->state.totalSpillFillSize, sizeof(float) * 8);
86 }
87
88 const API_STATE& state = GetApiState(pDC);
89
90 SWR_CS_CONTEXT csContext{ 0 };
91 csContext.tileCounter = threadGroupId;
92 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
93 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
94 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
95 csContext.pTGSM = pContext->pScratch[workerId];
96 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
97
98 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
99
100 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
101
102 RDTSC_STOP(BEDispatch, 1, 0);
103 }
104
105 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
106 {
107 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
108
109 uint32_t x, y;
110 MacroTileMgr::getTileIndices(macroTile, x, y);
111 SWR_ASSERT(x == 0 && y == 0);
112
113 if (pSync->pfnCallbackFunc != nullptr)
114 {
115 pSync->pfnCallbackFunc(pSync->userData, pSync->userData2, pSync->userData3);
116 }
117 }
118
119 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
120 {
121 QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData;
122 SWR_STATS* pStats = pQueryDesc->pStats;
123 SWR_CONTEXT *pContext = pDC->pContext;
124
125 SWR_ASSERT(pStats != nullptr);
126
127 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
128 {
129 pStats->DepthPassCount += pContext->stats[i].DepthPassCount;
130
131 pStats->IaVertices += pContext->stats[i].IaVertices;
132 pStats->IaPrimitives += pContext->stats[i].IaPrimitives;
133 pStats->VsInvocations += pContext->stats[i].VsInvocations;
134 pStats->HsInvocations += pContext->stats[i].HsInvocations;
135 pStats->DsInvocations += pContext->stats[i].DsInvocations;
136 pStats->GsInvocations += pContext->stats[i].GsInvocations;
137 pStats->PsInvocations += pContext->stats[i].PsInvocations;
138 pStats->CInvocations += pContext->stats[i].CInvocations;
139 pStats->CsInvocations += pContext->stats[i].CsInvocations;
140 pStats->CPrimitives += pContext->stats[i].CPrimitives;
141 pStats->GsPrimitives += pContext->stats[i].GsPrimitives;
142
143 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
144 {
145 pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream];
146
147 /// @note client is required to provide valid write offset before every draw, so we clear
148 /// out the contents of the write offset when storing stats
149 pContext->stats[i].SoWriteOffset[stream] = 0;
150
151 pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream];
152 pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream];
153 }
154 }
155 }
156
157 template<SWR_FORMAT format>
158 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
159 {
160 auto lambda = [&](int comp)
161 {
162 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
163 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
164 };
165
166 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
167 for (uint32_t i = 0; i < numIter; ++i)
168 {
169 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
170 }
171 }
172
173 template<SWR_FORMAT format>
174 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4])
175 {
176 // convert clear color to hottile format
177 // clear color is in RGBA float/uint32
178 simdvector vClear;
179 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
180 {
181 simdscalar vComp;
182 vComp = _simd_load1_ps((const float*)&clear[comp]);
183 if (FormatTraits<format>::isNormalized(comp))
184 {
185 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
186 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
187 }
188 vComp = FormatTraits<format>::pack(comp, vComp);
189 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
190 }
191
192 uint32_t tileX, tileY;
193 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
194 const API_STATE& state = GetApiState(pDC);
195
196 int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY;
197 int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1;
198 int left = KNOB_MACROTILE_X_DIM_FIXED * tileX;
199 int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1;
200
201 // intersect with scissor
202 top = std::max(top, state.scissorInFixedPoint.top);
203 left = std::max(left, state.scissorInFixedPoint.left);
204 bottom = std::min(bottom, state.scissorInFixedPoint.bottom);
205 right = std::min(right, state.scissorInFixedPoint.right);
206
207 // translate to local hottile origin
208 top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
209 bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
210 left -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
211 right -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
212
213 // convert to raster tiles
214 top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
215 bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
216 left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
217 right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
218
219 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
220 // compute steps between raster tile samples / raster tiles / macro tile rows
221 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
222 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
223 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
224 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
225
226 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
227 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples;
228 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
229
230 // loop over all raster tiles in the current hot tile
231 for (int y = top; y <= bottom; ++y)
232 {
233 uint8_t* pRasterTile = pRasterTileRow;
234 for (int x = left; x <= right; ++x)
235 {
236 for( int sampleNum = 0; sampleNum < numSamples; sampleNum++)
237 {
238 ClearRasterTile<format>(pRasterTile, vClear);
239 pRasterTile += rasterTileSampleStep;
240 }
241 }
242 pRasterTileRow += macroTileRowStep;
243 }
244
245 pHotTile->state = HOTTILE_DIRTY;
246 }
247
248
249 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
250 {
251 if (KNOB_FAST_CLEAR)
252 {
253 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
254 SWR_CONTEXT *pContext = pDC->pContext;
255 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
256 uint32_t numSamples = GetNumSamples(sampleCount);
257
258 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
259
260 RDTSC_START(BEClear);
261
262 if (pClear->flags.mask & SWR_CLEAR_COLOR)
263 {
264 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
265 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
266 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
267 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
268 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
269 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
270 pHotTile->state = HOTTILE_CLEAR;
271 }
272
273 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
274 {
275 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
276 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
277 pHotTile->state = HOTTILE_CLEAR;
278 }
279
280 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
281 {
282 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
283
284 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
285 pHotTile->state = HOTTILE_CLEAR;
286 }
287
288 RDTSC_STOP(BEClear, 0, 0);
289 }
290 else
291 {
292 // Legacy clear
293 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
294 RDTSC_START(BEClear);
295
296 if (pClear->flags.mask & SWR_CLEAR_COLOR)
297 {
298 /// @todo clear data should come in as RGBA32_FLOAT
299 DWORD clearData[4];
300 float clearFloat[4];
301 clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
302 clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
303 clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
304 clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
305 clearData[0] = *(DWORD*)&clearFloat[0];
306 clearData[1] = *(DWORD*)&clearFloat[1];
307 clearData[2] = *(DWORD*)&clearFloat[2];
308 clearData[3] = *(DWORD*)&clearFloat[3];
309
310 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
311 SWR_ASSERT(pfnClearTiles != nullptr);
312
313 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData);
314 }
315
316 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
317 {
318 DWORD clearData[4];
319 clearData[0] = *(DWORD*)&pClear->clearDepth;
320 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
321 SWR_ASSERT(pfnClearTiles != nullptr);
322
323 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData);
324 }
325
326 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
327 {
328 uint32_t value = pClear->clearStencil;
329 DWORD clearData[4];
330 clearData[0] = *(DWORD*)&value;
331 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
332
333 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData);
334 }
335
336 RDTSC_STOP(BEClear, 0, 0);
337 }
338 }
339
340
341 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
342 {
343 RDTSC_START(BEStoreTiles);
344 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
345 SWR_CONTEXT *pContext = pDC->pContext;
346
347 #ifdef KNOB_ENABLE_RDTSC
348 uint32_t numTiles = 0;
349 #endif
350 SWR_FORMAT srcFormat;
351 switch (pDesc->attachment)
352 {
353 case SWR_ATTACHMENT_COLOR0:
354 case SWR_ATTACHMENT_COLOR1:
355 case SWR_ATTACHMENT_COLOR2:
356 case SWR_ATTACHMENT_COLOR3:
357 case SWR_ATTACHMENT_COLOR4:
358 case SWR_ATTACHMENT_COLOR5:
359 case SWR_ATTACHMENT_COLOR6:
360 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
361 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
362 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
363 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
364 }
365
366 uint32_t x, y;
367 MacroTileMgr::getTileIndices(macroTile, x, y);
368
369 // Only need to store the hottile if it's been rendered to...
370 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
371 if (pHotTile)
372 {
373 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
374 if (pHotTile->state == HOTTILE_CLEAR)
375 {
376 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
377 SWR_ASSERT(pfnClearTiles != nullptr);
378
379 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData);
380 }
381
382 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
383 {
384 int destX = KNOB_MACROTILE_X_DIM * x;
385 int destY = KNOB_MACROTILE_Y_DIM * y;
386
387 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
388 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
389 }
390
391
392 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
393 {
394 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
395 }
396 }
397 RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
398 }
399
400
401 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
402 {
403 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
404 SWR_CONTEXT *pContext = pDC->pContext;
405
406 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
407
408 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
409 {
410 if (pDesc->attachmentMask & (1 << i))
411 {
412 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
413 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
414 if (pHotTile)
415 {
416 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
417 }
418 }
419 }
420 }
421
422 #if KNOB_SIMD_WIDTH == 8
423 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
424 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
425 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
426 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
427 #else
428 #error Unsupported vector width
429 #endif
430
431 INLINE
432 bool CanEarlyZ(const SWR_PS_STATE *pPSState)
433 {
434 return (pPSState->forceEarlyZ || (!pPSState->writesODepth && !pPSState->usesSourceDepth && !pPSState->usesUAV));
435 }
436
437 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
438 {
439 simdscalar vClipMask = _simd_setzero_ps();
440 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
441
442 for (uint32_t i = 0; i < numClipDistance; ++i)
443 {
444 // pull triangle clip distance values from clip buffer
445 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
446 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
447 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
448
449 // interpolate
450 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
451
452 // clip if interpolated clip distance is < 0 || NAN
453 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
454
455 vClipMask = _simd_or_ps(vClipMask, vCull);
456 }
457
458 return _simd_movemask_ps(vClipMask);
459 }
460
461 template<bool perspMask>
462 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
463 {
464 if(perspMask)
465 {
466 // evaluate I,J
467 psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
468 psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
469 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
470 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
471
472 // interpolate 1/w
473 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
474 }
475 }
476
477 template<bool perspMask>
478 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
479 {
480 if(perspMask)
481 {
482 // evaluate I,J
483 psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
484 psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
485 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
486 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
487
488 // interpolate 1/w
489 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
490 }
491 }
492
493
494 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
495 // Centroid behaves exactly as follows :
496 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
497 // have a sample location there).
498 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
499 // coverage with the SampleMask Rasterizer State.
500 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
501 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
502 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
503 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
504 template<SWR_MULTISAMPLE_COUNT sampleCount, bool bForcedSampleCount>
505 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
506 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
507 {
508 uint32_t inputMask[KNOB_SIMD_WIDTH];
509
510 generateInputCoverage<sampleCount, 1, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
511
512 // Case (2) - partially covered pixel
513
514 // scan for first covered sample per pixel in the 4x2 span
515 unsigned long sampleNum[KNOB_SIMD_WIDTH];
516 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
517 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
518 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
519 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
520 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
521 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
522 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
523 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
524
525 // look up and set the sample offsets from UL pixel corner for first covered sample
526 __m256 vXSample = _mm256_set_ps(MultisampleTraits<sampleCount>::X(sampleNum[7]),
527 MultisampleTraits<sampleCount>::X(sampleNum[6]),
528 MultisampleTraits<sampleCount>::X(sampleNum[5]),
529 MultisampleTraits<sampleCount>::X(sampleNum[4]),
530 MultisampleTraits<sampleCount>::X(sampleNum[3]),
531 MultisampleTraits<sampleCount>::X(sampleNum[2]),
532 MultisampleTraits<sampleCount>::X(sampleNum[1]),
533 MultisampleTraits<sampleCount>::X(sampleNum[0]));
534
535 __m256 vYSample = _mm256_set_ps(MultisampleTraits<sampleCount>::Y(sampleNum[7]),
536 MultisampleTraits<sampleCount>::Y(sampleNum[6]),
537 MultisampleTraits<sampleCount>::Y(sampleNum[5]),
538 MultisampleTraits<sampleCount>::Y(sampleNum[4]),
539 MultisampleTraits<sampleCount>::Y(sampleNum[3]),
540 MultisampleTraits<sampleCount>::Y(sampleNum[2]),
541 MultisampleTraits<sampleCount>::Y(sampleNum[1]),
542 MultisampleTraits<sampleCount>::Y(sampleNum[0]));
543 // add sample offset to UL pixel corner
544 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
545 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
546
547 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
548 static const __m256i vFullyCoveredMask = MultisampleTraits<sampleCount>::FullSampleMask();
549 __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
550 __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
551
552 static const __m256i vZero = _simd_setzero_si();
553 const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
554 __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
555 __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
556 __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
557
558 __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
559
560 // set the centroid position based on results from above
561 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
562 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
563
564 // Case (3a) No samples covered and partial sample mask
565 __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
566 // sample mask should never be all 0's for this case, but handle it anyways
567 unsigned long firstCoveredSampleMaskSample = 0;
568 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
569
570 __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
571
572 vXSample = _simd_set1_ps(MultisampleTraits<sampleCount>::X(firstCoveredSampleMaskSample));
573 vYSample = _simd_set1_ps(MultisampleTraits<sampleCount>::Y(firstCoveredSampleMaskSample));
574
575 // blend in case 3a pixel locations
576 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
577 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
578 }
579
580 template<uint32_t sampleCount, uint32_t persp, uint32_t standardPattern, uint32_t forcedMultisampleCount>
581 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
582 const uint64_t *const coverageMask, const uint32_t sampleMask,
583 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
584 {
585 static const bool bPersp = (bool)persp;
586 static const bool bIsStandardPattern = (bool)standardPattern;
587 static const bool bForcedMultisampleCount = (bool)forcedMultisampleCount;
588
589 // calculate centroid positions
590 if(bPersp)
591 {
592 if(bIsStandardPattern)
593 {
594 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
595 CalcCentroidPos<(SWR_MULTISAMPLE_COUNT)sampleCount, bForcedMultisampleCount>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
596 }
597 else
598 {
599 static const __m256 pixelCenter = _simd_set1_ps(0.5f);
600 psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
601 psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
602 }
603 // evaluate I,J
604 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
605 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
606 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
607 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
608
609 // interpolate 1/w
610 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
611 }
612 }
613
614 template<uint32_t NumRT, uint32_t sampleCountT>
615 void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
616 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask)
617 {
618 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
619 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
620 uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
621 simdvector blendOut;
622
623 for(uint32_t rt = 0; rt < NumRT; ++rt)
624 {
625 uint8_t *pColorSample;
626 if(sampleCount == SWR_MULTISAMPLE_1X)
627 {
628 pColorSample = pColorBase[rt];
629 }
630 else
631 {
632 pColorSample = pColorBase[rt] + rasterTileColorOffset;
633 }
634
635 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
636 // pfnBlendFunc may not update all channels. Initialize with PS output.
637 /// TODO: move this into the blend JIT.
638 blendOut = psContext.shaded[rt];
639
640 // Blend outputs and update coverage mask for alpha test
641 if(pfnBlendFunc[rt] != nullptr)
642 {
643 pfnBlendFunc[rt](
644 pBlendState,
645 psContext.shaded[rt],
646 psContext.shaded[1],
647 sample,
648 pColorSample,
649 blendOut,
650 &psContext.oMask,
651 (simdscalari*)&coverageMask);
652 }
653
654 // final write mask
655 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
656
657 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
658 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
659
660 const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
661
662 // store with color mask
663 if(!pRTBlend->writeDisableRed)
664 {
665 _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
666 }
667 if(!pRTBlend->writeDisableGreen)
668 {
669 _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
670 }
671 if(!pRTBlend->writeDisableBlue)
672 {
673 _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
674 }
675 if(!pRTBlend->writeDisableAlpha)
676 {
677 _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
678 }
679 }
680 }
681
682 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
683 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
684 {
685 RDTSC_START(BESetup);
686 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
687 static const bool bInputCoverage = (bool)inputCoverage;
688 static const bool bCentroidPos = (bool)centroidPos;
689
690 SWR_CONTEXT *pContext = pDC->pContext;
691 const API_STATE& state = GetApiState(pDC);
692 const SWR_RASTSTATE& rastState = state.rastState;
693 const SWR_PS_STATE *pPSState = &state.psState;
694 const SWR_BLEND_STATE *pBlendState = &state.blendState;
695 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
696 uint64_t coverageMask = work.coverageMask[0];
697
698 // broadcast scalars
699 BarycentricCoeffs coeffs;
700 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
701 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
702 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
703
704 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
705 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
706 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
707
708 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
709 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
710 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
711
712 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
713
714 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
715 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
716 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
717
718 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
719 uint32_t NumRT = state.psState.numRenderTargets;
720 for(uint32_t rt = 0; rt < NumRT; ++rt)
721 {
722 pColorBase[rt] = renderBuffers.pColor[rt];
723 }
724 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
725 RDTSC_STOP(BESetup, 0, 0);
726
727 SWR_PS_CONTEXT psContext;
728 psContext.pAttribs = work.pAttribs;
729 psContext.pPerspAttribs = work.pPerspAttribs;
730 psContext.frontFace = work.triFlags.frontFacing;
731 psContext.primID = work.triFlags.primID;
732
733 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
734 psContext.I = work.I;
735 psContext.J = work.J;
736 psContext.recipDet = work.recipDet;
737 psContext.pRecipW = work.pRecipW;
738 psContext.pSamplePosX = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX;
739 psContext.pSamplePosY = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY;
740
741 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
742 {
743 // UL pixel corner
744 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
745 // pixel center
746 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
747
748 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
749 {
750 if(bInputCoverage)
751 {
752 generateInputCoverage<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
753 }
754
755 if(coverageMask & MASK)
756 {
757 RDTSC_START(BEBarycentric);
758 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
759 // pixel center
760 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
761
762 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
763
764 if(bCentroidPos)
765 {
766 // for 1x case, centroid is pixel center
767 psContext.vX.centroid = psContext.vX.center;
768 psContext.vY.centroid = psContext.vY.center;
769 psContext.vI.centroid = psContext.vI.center;
770 psContext.vJ.centroid = psContext.vJ.center;
771 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
772 }
773
774 // interpolate and quantize z
775 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
776 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
777
778 RDTSC_STOP(BEBarycentric, 0, 0);
779
780 simdmask clipCoverageMask = coverageMask & MASK;
781
782 // interpolate user clip distance if available
783 if(rastState.clipDistanceMask)
784 {
785 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
786 psContext.vI.center, psContext.vJ.center);
787 }
788
789 simdscalar vCoverageMask = vMask(clipCoverageMask);
790 simdscalar depthPassMask = vCoverageMask;
791 simdscalar stencilPassMask = vCoverageMask;
792
793 // Early-Z?
794 if(CanEarlyZ(pPSState))
795 {
796 RDTSC_START(BEEarlyDepthTest);
797 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
798 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
799 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
800
801 // early-exit if no pixels passed depth or earlyZ is forced on
802 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
803 {
804 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
805 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
806
807 if (!_simd_movemask_ps(depthPassMask))
808 {
809 goto Endtile;
810 }
811 }
812 }
813
814 psContext.sampleIndex = 0;
815 psContext.activeMask = _simd_castps_si(vCoverageMask);
816
817 // execute pixel shader
818 RDTSC_START(BEPixelShader);
819 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
820 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
821 RDTSC_STOP(BEPixelShader, 0, 0);
822
823 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
824
825 // late-Z
826 if(!CanEarlyZ(pPSState))
827 {
828 RDTSC_START(BELateDepthTest);
829 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
830 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
831 RDTSC_STOP(BELateDepthTest, 0, 0);
832
833 if(!_simd_movemask_ps(depthPassMask))
834 {
835 // need to call depth/stencil write for stencil write
836 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
837 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
838 goto Endtile;
839 }
840 }
841
842 uint32_t statMask = _simd_movemask_ps(depthPassMask);
843 uint32_t statCount = _mm_popcnt_u32(statMask);
844 UPDATE_STAT(DepthPassCount, statCount);
845
846 // output merger
847 RDTSC_START(BEOutputMerger);
848 backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc,
849 vCoverageMask, depthPassMask);
850
851 // do final depth write after all pixel kills
852 if (!pPSState->forceEarlyZ)
853 {
854 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
855 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
856 }
857 RDTSC_STOP(BEOutputMerger, 0, 0);
858 }
859
860 Endtile:
861 RDTSC_START(BEEndTile);
862 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
863 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
864 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
865
866 for(uint32_t rt = 0; rt < NumRT; ++rt)
867 {
868 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
869 }
870 RDTSC_STOP(BEEndTile, 0, 0);
871 }
872 }
873 }
874
875 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
876 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
877 {
878 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
879 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
880 static const bool bInputCoverage = (bool)inputCoverage;
881 static const bool bCentroidPos = (bool)centroidPos;
882
883 RDTSC_START(BESetup);
884
885 SWR_CONTEXT *pContext = pDC->pContext;
886 const API_STATE& state = GetApiState(pDC);
887 const SWR_RASTSTATE& rastState = state.rastState;
888 const SWR_PS_STATE *pPSState = &state.psState;
889 const SWR_BLEND_STATE *pBlendState = &state.blendState;
890 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
891
892 // broadcast scalars
893 BarycentricCoeffs coeffs;
894 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
895 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
896 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
897
898 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
899 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
900 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
901
902 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
903 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
904 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
905
906 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
907
908 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
909 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
910 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
911
912 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
913 uint32_t NumRT = state.psState.numRenderTargets;
914 for(uint32_t rt = 0; rt < NumRT; ++rt)
915 {
916 pColorBase[rt] = renderBuffers.pColor[rt];
917 }
918 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
919 RDTSC_STOP(BESetup, 0, 0);
920
921 SWR_PS_CONTEXT psContext;
922 psContext.pAttribs = work.pAttribs;
923 psContext.pPerspAttribs = work.pPerspAttribs;
924 psContext.pRecipW = work.pRecipW;
925 psContext.frontFace = work.triFlags.frontFacing;
926 psContext.primID = work.triFlags.primID;
927
928 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
929 psContext.I = work.I;
930 psContext.J = work.J;
931 psContext.recipDet = work.recipDet;
932 psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
933 psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
934 const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples;
935
936 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
937 {
938 // UL pixel corner
939 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
940 // pixel center
941 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
942
943 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
944 {
945 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
946 // pixel center
947 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
948
949 RDTSC_START(BEBarycentric);
950 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
951 RDTSC_STOP(BEBarycentric, 0, 0);
952
953 if(bInputCoverage)
954 {
955 generateInputCoverage<sampleCount, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
956 }
957
958 if(bCentroidPos)
959 {
960 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
961 RDTSC_START(BEBarycentric);
962 backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
963 RDTSC_STOP(BEBarycentric, 0, 0);
964 }
965
966 for(uint32_t sample = 0; sample < numSamples; sample++)
967 {
968 if (work.coverageMask[sample] & MASK)
969 {
970 RDTSC_START(BEBarycentric);
971
972 // calculate per sample positions
973 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
974 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
975
976 simdmask coverageMask = work.coverageMask[sample] & MASK;
977 simdscalar vCoverageMask = vMask(coverageMask);
978
979 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
980
981 // interpolate and quantize z
982 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
983 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
984
985 RDTSC_STOP(BEBarycentric, 0, 0);
986
987 // interpolate user clip distance if available
988 if (rastState.clipDistanceMask)
989 {
990 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
991 psContext.vI.sample, psContext.vJ.sample);
992 }
993
994 simdscalar depthPassMask = vCoverageMask;
995 simdscalar stencilPassMask = vCoverageMask;
996
997 // offset depth/stencil buffers current sample
998 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
999 uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1000
1001 // Early-Z?
1002 if (CanEarlyZ(pPSState))
1003 {
1004 RDTSC_START(BEEarlyDepthTest);
1005 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1006 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1007 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1008
1009 // early-exit if no samples passed depth or earlyZ is forced on.
1010 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
1011 {
1012 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1013 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1014
1015 if (!_simd_movemask_ps(depthPassMask))
1016 {
1017 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1018 continue;
1019 }
1020 }
1021 }
1022
1023 psContext.sampleIndex = sample;
1024 psContext.activeMask = _simd_castps_si(vCoverageMask);
1025
1026 // execute pixel shader
1027 RDTSC_START(BEPixelShader);
1028 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
1029 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1030 RDTSC_STOP(BEPixelShader, 0, 0);
1031
1032 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
1033
1034 //// late-Z
1035 if (!CanEarlyZ(pPSState))
1036 {
1037 RDTSC_START(BELateDepthTest);
1038 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1039 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1040 RDTSC_STOP(BELateDepthTest, 0, 0);
1041
1042 if (!_simd_movemask_ps(depthPassMask))
1043 {
1044 // need to call depth/stencil write for stencil write
1045 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1046 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1047
1048 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1049 continue;
1050 }
1051 }
1052
1053 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1054 uint32_t statCount = _mm_popcnt_u32(statMask);
1055 UPDATE_STAT(DepthPassCount, statCount);
1056
1057 // output merger
1058 RDTSC_START(BEOutputMerger);
1059 backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
1060 vCoverageMask, depthPassMask);
1061
1062 // do final depth write after all pixel kills
1063 if (!pPSState->forceEarlyZ)
1064 {
1065 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1066 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1067 }
1068 RDTSC_STOP(BEOutputMerger, 0, 0);
1069 }
1070 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1071 }
1072 RDTSC_START(BEEndTile);
1073 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1074 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1075
1076 for (uint32_t rt = 0; rt < NumRT; ++rt)
1077 {
1078 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1079 }
1080 RDTSC_STOP(BEEndTile, 0, 0);
1081 }
1082 }
1083 }
1084
1085 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
1086 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1087 {
1088 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
1089 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
1090 static const bool bIsStandardPattern = (bool)samplePattern;
1091 static const bool bInputCoverage = (bool)inputCoverage;
1092 static const bool bCentroidPos = (bool)centroidPos;
1093 static const bool bForcedSampleCount = (bool)forcedSampleCount;
1094
1095 RDTSC_START(BESetup);
1096
1097 SWR_CONTEXT *pContext = pDC->pContext;
1098 const API_STATE& state = GetApiState(pDC);
1099 const SWR_RASTSTATE& rastState = state.rastState;
1100 const SWR_PS_STATE *pPSState = &state.psState;
1101 const SWR_BLEND_STATE *pBlendState = &state.blendState;
1102 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1103
1104 // broadcast scalars
1105 BarycentricCoeffs coeffs;
1106 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1107 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1108 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1109
1110 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1111 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1112 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1113
1114 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1115 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1116 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1117
1118 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1119
1120 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
1121 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
1122 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
1123
1124 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
1125 uint32_t NumRT = state.psState.numRenderTargets;
1126 for(uint32_t rt = 0; rt < NumRT; ++rt)
1127 {
1128 pColorBase[rt] = renderBuffers.pColor[rt];
1129 }
1130 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1131 RDTSC_STOP(BESetup, 0, 0);
1132
1133 SWR_PS_CONTEXT psContext;
1134 psContext.pAttribs = work.pAttribs;
1135 psContext.pPerspAttribs = work.pPerspAttribs;
1136 psContext.frontFace = work.triFlags.frontFacing;
1137 psContext.primID = work.triFlags.primID;
1138 psContext.pRecipW = work.pRecipW;
1139 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
1140 psContext.I = work.I;
1141 psContext.J = work.J;
1142 psContext.recipDet = work.recipDet;
1143 psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
1144 psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
1145 psContext.sampleIndex = 0;
1146
1147 uint32_t numCoverageSamples;
1148 if(bIsStandardPattern)
1149 {
1150 numCoverageSamples = MultisampleTraits<sampleCount>::numSamples;
1151 }
1152 else
1153 {
1154 numCoverageSamples = 1;
1155 }
1156
1157 uint32_t numOMSamples;
1158 // RT has to be single sample if we're in forcedMSAA mode
1159 if(bForcedSampleCount && (sampleCount > SWR_MULTISAMPLE_1X))
1160 {
1161 numOMSamples = 1;
1162 }
1163 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
1164 else if(bForcedSampleCount && (sampleCount == SWR_MULTISAMPLE_1X))
1165 {
1166 numOMSamples = GetNumSamples(pBlendState->sampleCount);
1167 }
1168 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
1169 else
1170 {
1171 numOMSamples = MultisampleTraits<sampleCount>::numSamples;
1172 }
1173
1174 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1175 {
1176 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1177 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
1178 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1179 {
1180 simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 };
1181 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1182 // set pixel center positions
1183 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
1184
1185 if (bInputCoverage)
1186 {
1187 generateInputCoverage<sampleCount, bIsStandardPattern, bForcedSampleCount>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
1188 }
1189
1190 if(bCentroidPos)
1191 {
1192 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
1193 RDTSC_START(BEBarycentric);
1194 backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
1195 RDTSC_STOP(BEBarycentric, 0, 0);
1196 }
1197
1198 // if oDepth written to, or there is a potential to discard any samples, we need to
1199 // run the PS early, then interp or broadcast Z and test
1200 if(pPSState->writesODepth || pPSState->killsPixel)
1201 {
1202 RDTSC_START(BEBarycentric);
1203 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1204
1205 // interpolate and quantize z
1206 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1207 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1208 RDTSC_STOP(BEBarycentric, 0, 0);
1209
1210 // execute pixel shader
1211 RDTSC_START(BEPixelShader);
1212 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1213 RDTSC_STOP(BEPixelShader, 0, 0);
1214 }
1215 else
1216 {
1217 psContext.activeMask = _simd_set1_epi32(-1);
1218 }
1219
1220 // need to declare enough space for all samples
1221 simdscalar vCoverageMask[MultisampleTraits<sampleCount>::numSamples];
1222 simdscalar depthPassMask[MultisampleTraits<sampleCount>::numSamples];
1223 simdscalar stencilPassMask[MultisampleTraits<sampleCount>::numSamples];
1224 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
1225 simdscalar anyStencilSamplePassed = _simd_setzero_ps();
1226 for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
1227 {
1228 vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
1229
1230 // pull mask back out for any discards and and with coverage
1231 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
1232
1233 if (!_simd_movemask_ps(vCoverageMask[sample]))
1234 {
1235 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
1236 continue;
1237 }
1238
1239 if(bForcedSampleCount)
1240 {
1241 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
1242 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
1243 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
1244 continue;
1245 }
1246
1247 depthPassMask[sample] = vCoverageMask[sample];
1248
1249 // if oDepth isn't written to, we need to interpolate Z for each sample
1250 // if clip distances are enabled, we need to interpolate for each sample
1251 if(!pPSState->writesODepth || rastState.clipDistanceMask)
1252 {
1253 RDTSC_START(BEBarycentric);
1254 if(bIsStandardPattern)
1255 {
1256 // calculate per sample positions
1257 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
1258 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
1259 }
1260 else
1261 {
1262 psContext.vX.sample = psContext.vX.center;
1263 psContext.vY.sample = psContext.vY.center;
1264 }
1265
1266 // calc I & J per sample
1267 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1268
1269 // interpolate and quantize z
1270 if (!pPSState->writesODepth)
1271 {
1272 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1273 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
1274 }
1275
1276 ///@todo: perspective correct vs non-perspective correct clipping?
1277 // interpolate clip distances
1278 if (rastState.clipDistanceMask)
1279 {
1280 uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1281 psContext.vI.sample, psContext.vJ.sample);
1282 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
1283 }
1284 RDTSC_STOP(BEBarycentric, 0, 0);
1285 }
1286 // else 'broadcast' and test psContext.vZ written from the PS each sample
1287 else
1288 {
1289 vZ[sample] = psContext.vZ;
1290 }
1291
1292 // offset depth/stencil buffers current sample
1293 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1294 uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1295
1296 // ZTest for this sample
1297 RDTSC_START(BEEarlyDepthTest);
1298 stencilPassMask[sample] = vCoverageMask[sample];
1299 depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing,
1300 vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
1301 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1302
1303 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
1304 anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
1305 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
1306 uint32_t statCount = _mm_popcnt_u32(statMask);
1307 UPDATE_STAT(DepthPassCount, statCount);
1308 }
1309
1310 // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
1311 if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
1312 {
1313 RDTSC_START(BEBarycentric);
1314 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1315 // interpolate and quantize z
1316 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1317 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1318 RDTSC_STOP(BEBarycentric, 0, 0);
1319
1320 // execute pixel shader
1321 RDTSC_START(BEPixelShader);
1322 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1323 RDTSC_STOP(BEPixelShader, 0, 0);
1324 }
1325 ///@todo: make sure this works for kill pixel
1326 else if(!_simd_movemask_ps(anyStencilSamplePassed))
1327 {
1328 goto Endtile;
1329 }
1330
1331 // loop over all samples, broadcasting the results of the PS to all passing pixels
1332 for(uint32_t sample = 0; sample < numOMSamples; sample++)
1333 {
1334 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1335 uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1336
1337 // output merger
1338 RDTSC_START(BEOutputMerger);
1339
1340 // skip if none of the pixels for this sample passed
1341 simdscalar coverageMaskSample;
1342 simdscalar depthMaskSample;
1343 simdscalar stencilMaskSample;
1344 simdscalar vInterpolatedZ;
1345
1346 // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
1347 // depth test is disabled, so just set the z val to 0.
1348 if(bForcedSampleCount)
1349 {
1350 coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
1351 vInterpolatedZ = _simd_setzero_ps();
1352 }
1353 else if(bIsStandardPattern)
1354 {
1355 if(!_simd_movemask_ps(depthPassMask[sample]))
1356 {
1357 depthPassMask[sample] = _simd_setzero_ps();
1358 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample],
1359 vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
1360 continue;
1361 }
1362 coverageMaskSample = vCoverageMask[sample];
1363 depthMaskSample = depthPassMask[sample];
1364 stencilMaskSample = stencilPassMask[sample];
1365 vInterpolatedZ = vZ[sample];
1366 }
1367 else
1368 {
1369 // center pattern only needs to use a single depth test as all samples are at the same position
1370 if(!_simd_movemask_ps(depthPassMask[0]))
1371 {
1372 depthPassMask[0] = _simd_setzero_ps();
1373 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0],
1374 vCoverageMask[0], pStencilSample, stencilPassMask[0]);
1375 continue;
1376 }
1377 coverageMaskSample = (vCoverageMask[0]);
1378 depthMaskSample = depthPassMask[0];
1379 stencilMaskSample = stencilPassMask[0];
1380 vInterpolatedZ = vZ[0];
1381 }
1382
1383 // output merger
1384 RDTSC_START(BEOutputMerger);
1385 backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
1386 coverageMaskSample, depthMaskSample);
1387
1388 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample,
1389 coverageMaskSample, pStencilSample, stencilMaskSample);
1390 RDTSC_STOP(BEOutputMerger, 0, 0);
1391 }
1392
1393 Endtile:
1394 RDTSC_START(BEEndTile);
1395 for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
1396 {
1397 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1398 }
1399
1400 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1401 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1402
1403 for(uint32_t rt = 0; rt < NumRT; ++rt)
1404 {
1405 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1406 }
1407 RDTSC_STOP(BEEndTile, 0, 0);
1408 }
1409 }
1410 }
1411 // optimized backend flow with NULL PS
1412 template<uint32_t sampleCountT>
1413 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1414 {
1415 RDTSC_START(BESetup);
1416
1417 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
1418
1419 SWR_CONTEXT *pContext = pDC->pContext;
1420 const API_STATE& state = GetApiState(pDC);
1421 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1422 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1423
1424 // broadcast scalars
1425 BarycentricCoeffs coeffs;
1426 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1427 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1428 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1429
1430 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1431 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1432 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1433
1434 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1435 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1436 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1437
1438 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1439
1440 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1441
1442 RDTSC_STOP(BESetup, 0, 0);
1443
1444 SWR_PS_CONTEXT psContext;
1445 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1446 {
1447 // UL pixel corner
1448 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
1449
1450 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1451 {
1452 // UL pixel corners
1453 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
1454
1455 // iterate over active samples
1456 unsigned long sample = 0;
1457 uint32_t sampleMask = state.blendState.sampleMask;
1458 while (_BitScanForward(&sample, sampleMask))
1459 {
1460 sampleMask &= ~(1 << sample);
1461 simdmask coverageMask = work.coverageMask[sample] & MASK;
1462 if (coverageMask)
1463 {
1464 RDTSC_START(BEBarycentric);
1465 // calculate per sample positions
1466 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample));
1467 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample));
1468
1469 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1470
1471 // interpolate and quantize z
1472 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1473 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
1474
1475 RDTSC_STOP(BEBarycentric, 0, 0);
1476
1477 // interpolate user clip distance if available
1478 if (rastState.clipDistanceMask)
1479 {
1480 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1481 psContext.vI.sample, psContext.vJ.sample);
1482 }
1483
1484 simdscalar vCoverageMask = vMask(coverageMask);
1485 simdscalar stencilPassMask = vCoverageMask;
1486
1487 // offset depth/stencil buffers current sample
1488 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1489 uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1490
1491 RDTSC_START(BEEarlyDepthTest);
1492 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
1493 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1494 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1495 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1496 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1497
1498 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1499 uint32_t statCount = _mm_popcnt_u32(statMask);
1500 UPDATE_STAT(DepthPassCount, statCount);
1501 }
1502 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1503 }
1504 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1505 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1506 }
1507 }
1508 }
1509
1510 void InitClearTilesTable()
1511 {
1512 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1513
1514 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1515 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1516 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1517 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1518 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1519 }
1520
1521 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
1522 PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {};
1523 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {};
1524 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {};
1525 PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {};
1526 PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {};
1527 PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {};
1528 PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2] = {};
1529
1530 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1531 // arguments to static template arguments.
1532 template <uint32_t... ArgsT>
1533 struct OMChooser
1534 {
1535 // Last Arg Terminator
1536 static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg)
1537 {
1538 switch(tArg)
1539 {
1540 case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break;
1541 case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break;
1542 case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break;
1543 case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break;
1544 case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break;
1545 default:
1546 SWR_ASSERT(0 && "Invalid sample count\n");
1547 return nullptr;
1548 break;
1549 }
1550 }
1551
1552 // Recursively parse args
1553 template <typename... TArgsT>
1554 static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1555 {
1556 switch(tArg)
1557 {
1558 case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break;
1559 case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break;
1560 case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break;
1561 case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break;
1562 case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break;
1563 case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break;
1564 case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break;
1565 case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break;
1566 case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break;
1567 default:
1568 SWR_ASSERT(0 && "Invalid RT index\n");
1569 return nullptr;
1570 break;
1571 }
1572 }
1573 };
1574
1575 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1576 // arguments to static template arguments.
1577 template <uint32_t... ArgsT>
1578 struct BECentroidBarycentricChooser
1579 {
1580
1581 // Last Arg Terminator
1582 template <typename... TArgsT>
1583 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg)
1584 {
1585 if(tArg > 0)
1586 {
1587 return CalcCentroidBarycentrics<ArgsT..., 1>;
1588 }
1589
1590 return CalcCentroidBarycentrics<ArgsT..., 0>;
1591 }
1592
1593 // Recursively parse args
1594 template <typename... TArgsT>
1595 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1596 {
1597 switch(tArg)
1598 {
1599 case SWR_MULTISAMPLE_1X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1600 case SWR_MULTISAMPLE_2X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1601 case SWR_MULTISAMPLE_4X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1602 case SWR_MULTISAMPLE_8X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1603 case SWR_MULTISAMPLE_16X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1604 default:
1605 SWR_ASSERT(0 && "Invalid sample count\n");
1606 return nullptr;
1607 break;
1608 }
1609 }
1610
1611 // Recursively parse args
1612 template <typename... TArgsT>
1613 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1614 {
1615 if(tArg > 0)
1616 {
1617 return BECentroidBarycentricChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1618 }
1619
1620 return BECentroidBarycentricChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1621 }
1622 };
1623
1624 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1625 // arguments to static template arguments.
1626 template <uint32_t... ArgsT>
1627 struct BEChooser
1628 {
1629 // Last Arg Terminator
1630 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1631 {
1632 switch(tArg)
1633 {
1634 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<ArgsT...>; break;
1635 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<ArgsT...>; break;
1636 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<ArgsT...>; break;
1637 default:
1638 SWR_ASSERT(0 && "Invalid backend func\n");
1639 return nullptr;
1640 break;
1641 }
1642 }
1643
1644
1645 // Recursively parse args
1646 template <typename... TArgsT>
1647 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1648 {
1649 switch(tArg)
1650 {
1651 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1652 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1653 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1654 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1655 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1656 default:
1657 SWR_ASSERT(0 && "Invalid sample count\n");
1658 return nullptr;
1659 break;
1660 }
1661 }
1662
1663 // Recursively parse args
1664 template <typename... TArgsT>
1665 static PFN_BACKEND_FUNC GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1666 {
1667 if(tArg > 0)
1668 {
1669 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1670 }
1671
1672 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1673 }
1674 };
1675
1676 template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates>
1677 void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates])
1678 {
1679 for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++)
1680 {
1681 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1682 {
1683 table[rtNum][sampleCount] =
1684 OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount);
1685 }
1686 }
1687 }
1688
1689 template <SWR_MULTISAMPLE_COUNT numSampleRates>
1690 void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2],
1691 PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2],
1692 PFN_CALC_CENTROID_BARYCENTRICS (&centroidTable)[numSampleRates][2][2][2])
1693 {
1694 pixelTable[0] = CalcPixelBarycentrics<0>;
1695 pixelTable[1] = CalcPixelBarycentrics<1>;
1696
1697 sampleTable[0] = CalcSampleBarycentrics<0>;
1698 sampleTable[1] = CalcSampleBarycentrics<1>;
1699
1700 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1701 {
1702 for(uint32_t baryMask = 0; baryMask < 2; baryMask++)
1703 {
1704 for(uint32_t patternNum = 0; patternNum < 2; patternNum++)
1705 {
1706 for(uint32_t forcedSampleEnable = 0; forcedSampleEnable < 2; forcedSampleEnable++)
1707 {
1708 centroidTable[sampleCount][baryMask][patternNum][forcedSampleEnable]=
1709 BECentroidBarycentricChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, baryMask, patternNum, forcedSampleEnable);
1710 }
1711 }
1712 }
1713 }
1714 }
1715
1716 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2])
1717 {
1718 gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1719 gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1720 gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1721 gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1722 }
1723
1724 template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
1725 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2])
1726 {
1727 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1728 {
1729 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++)
1730 {
1731 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
1732 {
1733 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1734 {
1735 table[sampleCount][samplePattern][inputCoverage][isCentroid][0] =
1736 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
1737 table[sampleCount][samplePattern][inputCoverage][isCentroid][1] =
1738 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 1, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
1739 }
1740 }
1741 }
1742 }
1743 }
1744
1745 template <uint32_t numSampleRates, uint32_t numCoverageModes>
1746 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2])
1747 {
1748 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1749 {
1750 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
1751 {
1752 table[sampleCount][inputCoverage][0] =
1753 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1754 table[sampleCount][inputCoverage][1] =
1755 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1756 }
1757 }
1758 }
1759
1760 void InitBackendFuncTables()
1761 {
1762 InitBackendSampleFuncTable(gBackendSingleSample);
1763 InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
1764 InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
1765 InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
1766 InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable, gCentroidBarycentricTable);
1767
1768 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1769 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1770 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1771 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1772 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1773 }