swr: [rasterizer core] store blend output in temporary instead of PS output.
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "rdtsc_core.h"
33 #include "backend.h"
34 #include "depthstencil.h"
35 #include "tilemgr.h"
36 #include "memory/tilingtraits.h"
37 #include "core/multisample.h"
38
39 #include <algorithm>
40
41 const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5};
42 const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5};
43
44 /// @todo move to common lib
45 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
46 static const __m128 gMaskToVec[] = {
47 MASKTOVEC(0,0,0,0),
48 MASKTOVEC(0,0,0,1),
49 MASKTOVEC(0,0,1,0),
50 MASKTOVEC(0,0,1,1),
51 MASKTOVEC(0,1,0,0),
52 MASKTOVEC(0,1,0,1),
53 MASKTOVEC(0,1,1,0),
54 MASKTOVEC(0,1,1,1),
55 MASKTOVEC(1,0,0,0),
56 MASKTOVEC(1,0,0,1),
57 MASKTOVEC(1,0,1,0),
58 MASKTOVEC(1,0,1,1),
59 MASKTOVEC(1,1,0,0),
60 MASKTOVEC(1,1,0,1),
61 MASKTOVEC(1,1,1,0),
62 MASKTOVEC(1,1,1,1),
63 };
64
65 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
66 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
67
68 //////////////////////////////////////////////////////////////////////////
69 /// @brief Process compute work.
70 /// @param pDC - pointer to draw context (dispatch).
71 /// @param workerId - The unique worker ID that is assigned to this thread.
72 /// @param threadGroupId - the linear index for the thread group within the dispatch.
73 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
74 {
75 RDTSC_START(BEDispatch);
76
77 SWR_CONTEXT *pContext = pDC->pContext;
78
79 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
80 SWR_ASSERT(pTaskData != nullptr);
81
82 // Ensure spill fill memory has been allocated.
83 if (pDC->pSpillFill[workerId] == nullptr)
84 {
85 ///@todo Add state which indicates the spill fill size.
86 pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8);
87 }
88
89 const API_STATE& state = GetApiState(pDC);
90
91 SWR_CS_CONTEXT csContext{ 0 };
92 csContext.tileCounter = threadGroupId;
93 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
94 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
95 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
96 csContext.pTGSM = pContext->pScratch[workerId];
97 csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
98
99 state.pfnCsFunc(GetPrivateState(pDC), &csContext);
100
101 UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
102
103 RDTSC_STOP(BEDispatch, 1, 0);
104 }
105
106 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
107 {
108 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
109
110 uint32_t x, y;
111 MacroTileMgr::getTileIndices(macroTile, x, y);
112 SWR_ASSERT(x == 0 && y == 0);
113
114 if (pSync->pfnCallbackFunc != nullptr)
115 {
116 pSync->pfnCallbackFunc(pSync->userData, pSync->userData2, pSync->userData3);
117 }
118 }
119
120 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
121 {
122 QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData;
123 SWR_STATS* pStats = pQueryDesc->pStats;
124 SWR_CONTEXT *pContext = pDC->pContext;
125
126 SWR_ASSERT(pStats != nullptr);
127
128 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
129 {
130 pStats->DepthPassCount += pContext->stats[i].DepthPassCount;
131
132 pStats->IaVertices += pContext->stats[i].IaVertices;
133 pStats->IaPrimitives += pContext->stats[i].IaPrimitives;
134 pStats->VsInvocations += pContext->stats[i].VsInvocations;
135 pStats->HsInvocations += pContext->stats[i].HsInvocations;
136 pStats->DsInvocations += pContext->stats[i].DsInvocations;
137 pStats->GsInvocations += pContext->stats[i].GsInvocations;
138 pStats->PsInvocations += pContext->stats[i].PsInvocations;
139 pStats->CInvocations += pContext->stats[i].CInvocations;
140 pStats->CsInvocations += pContext->stats[i].CsInvocations;
141 pStats->CPrimitives += pContext->stats[i].CPrimitives;
142 pStats->GsPrimitives += pContext->stats[i].GsPrimitives;
143
144 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
145 {
146 pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream];
147
148 /// @note client is required to provide valid write offset before every draw, so we clear
149 /// out the contents of the write offset when storing stats
150 pContext->stats[i].SoWriteOffset[stream] = 0;
151
152 pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream];
153 pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream];
154 }
155 }
156 }
157
158 template<SWR_FORMAT format>
159 void ClearRasterTile(BYTE *pTileBuffer, simdvector &value)
160 {
161 auto lambda = [&](int comp)
162 {
163 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
164 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
165 };
166
167 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
168 for (uint32_t i = 0; i < numIter; ++i)
169 {
170 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
171 }
172 }
173
174 template<SWR_FORMAT format>
175 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4])
176 {
177 // convert clear color to hottile format
178 // clear color is in RGBA float/uint32
179 simdvector vClear;
180 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
181 {
182 simdscalar vComp;
183 vComp = _simd_load1_ps((const float*)&clear[comp]);
184 if (FormatTraits<format>::isNormalized(comp))
185 {
186 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
187 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
188 }
189 vComp = FormatTraits<format>::pack(comp, vComp);
190 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
191 }
192
193 uint32_t tileX, tileY;
194 MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
195 const API_STATE& state = GetApiState(pDC);
196
197 int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY;
198 int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1;
199 int left = KNOB_MACROTILE_X_DIM_FIXED * tileX;
200 int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1;
201
202 // intersect with scissor
203 top = std::max(top, state.scissorInFixedPoint.top);
204 left = std::max(left, state.scissorInFixedPoint.left);
205 bottom = std::min(bottom, state.scissorInFixedPoint.bottom);
206 right = std::min(right, state.scissorInFixedPoint.right);
207
208 // translate to local hottile origin
209 top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
210 bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY;
211 left -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
212 right -= KNOB_MACROTILE_X_DIM_FIXED * tileX;
213
214 // convert to raster tiles
215 top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
216 bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
217 left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
218 right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
219
220 const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
221 // compute steps between raster tile samples / raster tiles / macro tile rows
222 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
223 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
224 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
225 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
226
227 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples);
228 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples;
229 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
230
231 // loop over all raster tiles in the current hot tile
232 for (int y = top; y <= bottom; ++y)
233 {
234 uint8_t* pRasterTile = pRasterTileRow;
235 for (int x = left; x <= right; ++x)
236 {
237 for( int sampleNum = 0; sampleNum < numSamples; sampleNum++)
238 {
239 ClearRasterTile<format>(pRasterTile, vClear);
240 pRasterTile += rasterTileSampleStep;
241 }
242 }
243 pRasterTileRow += macroTileRowStep;
244 }
245
246 pHotTile->state = HOTTILE_DIRTY;
247 }
248
249
250 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
251 {
252 if (KNOB_FAST_CLEAR)
253 {
254 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
255 SWR_CONTEXT *pContext = pDC->pContext;
256 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
257 uint32_t numSamples = GetNumSamples(sampleCount);
258
259 SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
260
261 RDTSC_START(BEClear);
262
263 if (pClear->flags.mask & SWR_CLEAR_COLOR)
264 {
265 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples);
266 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
267 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
268 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
269 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
270 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
271 pHotTile->state = HOTTILE_CLEAR;
272 }
273
274 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
275 {
276 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples);
277 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
278 pHotTile->state = HOTTILE_CLEAR;
279 }
280
281 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
282 {
283 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples);
284
285 pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil;
286 pHotTile->state = HOTTILE_CLEAR;
287 }
288
289 RDTSC_STOP(BEClear, 0, 0);
290 }
291 else
292 {
293 // Legacy clear
294 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
295 RDTSC_START(BEClear);
296
297 if (pClear->flags.mask & SWR_CLEAR_COLOR)
298 {
299 /// @todo clear data should come in as RGBA32_FLOAT
300 DWORD clearData[4];
301 float clearFloat[4];
302 clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f;
303 clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f;
304 clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f;
305 clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f;
306 clearData[0] = *(DWORD*)&clearFloat[0];
307 clearData[1] = *(DWORD*)&clearFloat[1];
308 clearData[2] = *(DWORD*)&clearFloat[2];
309 clearData[3] = *(DWORD*)&clearFloat[3];
310
311 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
312 SWR_ASSERT(pfnClearTiles != nullptr);
313
314 pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData);
315 }
316
317 if (pClear->flags.mask & SWR_CLEAR_DEPTH)
318 {
319 DWORD clearData[4];
320 clearData[0] = *(DWORD*)&pClear->clearDepth;
321 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
322 SWR_ASSERT(pfnClearTiles != nullptr);
323
324 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData);
325 }
326
327 if (pClear->flags.mask & SWR_CLEAR_STENCIL)
328 {
329 uint32_t value = pClear->clearStencil;
330 DWORD clearData[4];
331 clearData[0] = *(DWORD*)&value;
332 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
333
334 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData);
335 }
336
337 RDTSC_STOP(BEClear, 0, 0);
338 }
339 }
340
341
342 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
343 {
344 RDTSC_START(BEStoreTiles);
345 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
346 SWR_CONTEXT *pContext = pDC->pContext;
347
348 #ifdef KNOB_ENABLE_RDTSC
349 uint32_t numTiles = 0;
350 #endif
351 SWR_FORMAT srcFormat;
352 switch (pDesc->attachment)
353 {
354 case SWR_ATTACHMENT_COLOR0:
355 case SWR_ATTACHMENT_COLOR1:
356 case SWR_ATTACHMENT_COLOR2:
357 case SWR_ATTACHMENT_COLOR3:
358 case SWR_ATTACHMENT_COLOR4:
359 case SWR_ATTACHMENT_COLOR5:
360 case SWR_ATTACHMENT_COLOR6:
361 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
362 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
363 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
364 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
365 }
366
367 uint32_t x, y;
368 MacroTileMgr::getTileIndices(macroTile, x, y);
369
370 // Only need to store the hottile if it's been rendered to...
371 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false);
372 if (pHotTile)
373 {
374 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
375 if (pHotTile->state == HOTTILE_CLEAR)
376 {
377 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
378 SWR_ASSERT(pfnClearTiles != nullptr);
379
380 pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData);
381 }
382
383 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
384 {
385 int destX = KNOB_MACROTILE_X_DIM * x;
386 int destY = KNOB_MACROTILE_Y_DIM * y;
387
388 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
389 pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
390 }
391
392
393 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
394 {
395 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
396 }
397 }
398 RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
399 }
400
401
402 void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
403 {
404 INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData;
405 SWR_CONTEXT *pContext = pDC->pContext;
406
407 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
408 {
409 if (pDesc->attachmentMask & (1 << i))
410 {
411 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false);
412 if (pHotTile)
413 {
414 pHotTile->state = HOTTILE_INVALID;
415 }
416 }
417 }
418 }
419
420 #if KNOB_SIMD_WIDTH == 8
421 const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
422 const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
423 const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
424 const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
425 #define MASK 0xff
426 #else
427 #error Unsupported vector width
428 #endif
429
430 INLINE
431 bool CanEarlyZ(const SWR_PS_STATE *pPSState)
432 {
433 return (pPSState->forceEarlyZ || (!pPSState->writesODepth && !pPSState->usesSourceDepth && !pPSState->usesUAV));
434 }
435
436 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
437 {
438 simdscalar vClipMask = _simd_setzero_ps();
439 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
440
441 for (uint32_t i = 0; i < numClipDistance; ++i)
442 {
443 // pull triangle clip distance values from clip buffer
444 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
445 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
446 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
447
448 // interpolate
449 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
450
451 // clip if interpolated clip distance is < 0 || NAN
452 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
453
454 vClipMask = _simd_or_ps(vClipMask, vCull);
455 }
456
457 return _simd_movemask_ps(vClipMask);
458 }
459
460 template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
461 INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
462 {
463
464 // will need to update for avx512
465 assert(KNOB_SIMD_WIDTH == 8);
466
467 __m256i mask[2];
468 __m256i sampleCoverage[2];
469 if(bIsStandardPattern)
470 {
471 __m256i src = _mm256_set1_epi32(0);
472 __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
473
474 if(MultisampleTraits<sampleCountT>::numSamples == 1)
475 {
476 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
477 }
478 else if(MultisampleTraits<sampleCountT>::numSamples == 2)
479 {
480 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
481 }
482 else if(MultisampleTraits<sampleCountT>::numSamples == 4)
483 {
484 mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
485 }
486 else if(MultisampleTraits<sampleCountT>::numSamples == 8)
487 {
488 mask[0] = _mm256_set1_epi32(-1);
489 }
490 else if(MultisampleTraits<sampleCountT>::numSamples == 16)
491 {
492 mask[0] = _mm256_set1_epi32(-1);
493 mask[1] = _mm256_set1_epi32(-1);
494 index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
495 }
496
497 // gather coverage for samples 0-7
498 sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
499 if(MultisampleTraits<sampleCountT>::numSamples > 8)
500 {
501 // gather coverage for samples 8-15
502 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
503 }
504 }
505 else
506 {
507 // center coverage is the same for all samples; just broadcast to the sample slots
508 uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
509 if(MultisampleTraits<sampleCountT>::numSamples == 1)
510 {
511 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
512 }
513 else if(MultisampleTraits<sampleCountT>::numSamples == 2)
514 {
515 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
516 }
517 else if(MultisampleTraits<sampleCountT>::numSamples == 4)
518 {
519 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
520 }
521 else if(MultisampleTraits<sampleCountT>::numSamples == 8)
522 {
523 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
524 }
525 else if(MultisampleTraits<sampleCountT>::numSamples == 16)
526 {
527 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
528 sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
529 }
530 }
531
532 mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
533 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
534 // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
535 __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
536
537 __m256i packedCoverage1;
538 if(MultisampleTraits<sampleCountT>::numSamples > 8)
539 {
540 // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
541 packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
542 }
543
544 #if (KNOB_ARCH == KNOB_ARCH_AVX)
545 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
546 __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
547 __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
548 packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
549
550 __m256i packedSampleCoverage;
551 if(MultisampleTraits<sampleCountT>::numSamples > 8)
552 {
553 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
554 hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
555 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
556 shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
557 packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
558 packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
559 }
560 else
561 {
562 packedSampleCoverage = packedCoverage0;
563 }
564 #else
565 __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
566 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
567 packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
568
569 __m256i packedSampleCoverage;
570 if(MultisampleTraits<sampleCountT>::numSamples > 8)
571 {
572 permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
573 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
574 packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
575
576 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
577 packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
578 }
579 else
580 {
581 packedSampleCoverage = packedCoverage0;
582 }
583 #endif
584
585 for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
586 {
587 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
588 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
589
590 if(!bForcedSampleCount)
591 {
592 // input coverage has to be anded with sample mask if MSAA isn't forced on
593 inputMask[i] &= sampleMask;
594 }
595
596 // shift to the next pixel in the 4x2
597 packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
598 }
599 }
600
601 template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
602 INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
603 {
604 uint32_t inputMask[KNOB_SIMD_WIDTH];
605 generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
606 inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
607 }
608
609 template<bool perspMask>
610 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
611 {
612 if(perspMask)
613 {
614 // evaluate I,J
615 psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
616 psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
617 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
618 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
619
620 // interpolate 1/w
621 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
622 }
623 }
624
625 template<bool perspMask>
626 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
627 {
628 if(perspMask)
629 {
630 // evaluate I,J
631 psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
632 psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
633 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
634 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
635
636 // interpolate 1/w
637 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
638 }
639 }
640
641
642 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
643 // Centroid behaves exactly as follows :
644 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
645 // have a sample location there).
646 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
647 // coverage with the SampleMask Rasterizer State.
648 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
649 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
650 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
651 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
652 template<SWR_MULTISAMPLE_COUNT sampleCount, bool bForcedSampleCount>
653 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
654 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
655 {
656 uint32_t inputMask[KNOB_SIMD_WIDTH];
657
658 generateInputCoverage<sampleCount, 1, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
659
660 // Case (2) - partially covered pixel
661
662 // scan for first covered sample per pixel in the 4x2 span
663 unsigned long sampleNum[KNOB_SIMD_WIDTH];
664 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
665 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
666 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
667 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
668 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
669 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
670 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
671 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
672
673 // look up and set the sample offsets from UL pixel corner for first covered sample
674 __m256 vXSample = _mm256_set_ps(MultisampleTraits<sampleCount>::X(sampleNum[7]),
675 MultisampleTraits<sampleCount>::X(sampleNum[6]),
676 MultisampleTraits<sampleCount>::X(sampleNum[5]),
677 MultisampleTraits<sampleCount>::X(sampleNum[4]),
678 MultisampleTraits<sampleCount>::X(sampleNum[3]),
679 MultisampleTraits<sampleCount>::X(sampleNum[2]),
680 MultisampleTraits<sampleCount>::X(sampleNum[1]),
681 MultisampleTraits<sampleCount>::X(sampleNum[0]));
682
683 __m256 vYSample = _mm256_set_ps(MultisampleTraits<sampleCount>::Y(sampleNum[7]),
684 MultisampleTraits<sampleCount>::Y(sampleNum[6]),
685 MultisampleTraits<sampleCount>::Y(sampleNum[5]),
686 MultisampleTraits<sampleCount>::Y(sampleNum[4]),
687 MultisampleTraits<sampleCount>::Y(sampleNum[3]),
688 MultisampleTraits<sampleCount>::Y(sampleNum[2]),
689 MultisampleTraits<sampleCount>::Y(sampleNum[1]),
690 MultisampleTraits<sampleCount>::Y(sampleNum[0]));
691 // add sample offset to UL pixel corner
692 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
693 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
694
695 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
696 static const __m256i vFullyCoveredMask = MultisampleTraits<sampleCount>::FullSampleMask();
697 __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
698 __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
699
700 static const __m256i vZero = _simd_setzero_si();
701 const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
702 __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
703 __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
704 __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
705
706 __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
707
708 // set the centroid position based on results from above
709 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
710 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
711
712 // Case (3a) No samples covered and partial sample mask
713 __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
714 // sample mask should never be all 0's for this case, but handle it anyways
715 unsigned long firstCoveredSampleMaskSample = 0;
716 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
717
718 __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
719
720 vXSample = _simd_set1_ps(MultisampleTraits<sampleCount>::X(firstCoveredSampleMaskSample));
721 vYSample = _simd_set1_ps(MultisampleTraits<sampleCount>::Y(firstCoveredSampleMaskSample));
722
723 // blend in case 3a pixel locations
724 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
725 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
726 }
727
728 template<uint32_t sampleCount, uint32_t persp, uint32_t standardPattern, uint32_t forcedMultisampleCount>
729 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
730 const uint64_t *const coverageMask, const uint32_t sampleMask,
731 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
732 {
733 static const bool bPersp = (bool)persp;
734 static const bool bIsStandardPattern = (bool)standardPattern;
735 static const bool bForcedMultisampleCount = (bool)forcedMultisampleCount;
736
737 // calculate centroid positions
738 if(bPersp)
739 {
740 if(bIsStandardPattern)
741 {
742 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
743 CalcCentroidPos<(SWR_MULTISAMPLE_COUNT)sampleCount, bForcedMultisampleCount>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
744 }
745 else
746 {
747 static const __m256 pixelCenter = _simd_set1_ps(0.5f);
748 psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
749 psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
750 }
751 // evaluate I,J
752 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
753 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
754 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
755 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
756
757 // interpolate 1/w
758 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
759 }
760 }
761
762 template<uint32_t NumRT, uint32_t sampleCountT>
763 void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
764 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask)
765 {
766 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
767 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
768 uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
769 simdvector blendOut;
770
771 for(uint32_t rt = 0; rt < NumRT; ++rt)
772 {
773 uint8_t *pColorSample;
774 if(sampleCount == SWR_MULTISAMPLE_1X)
775 {
776 pColorSample = pColorBase[rt];
777 }
778 else
779 {
780 pColorSample = pColorBase[rt] + rasterTileColorOffset;
781 }
782
783 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
784 // pfnBlendFunc may not update all channels. Initialize with PS output.
785 /// TODO: move this into the blend JIT.
786 blendOut = psContext.shaded[rt];
787
788 // Blend outputs and update coverage mask for alpha test
789 if(pfnBlendFunc[rt] != nullptr)
790 {
791 pfnBlendFunc[rt](
792 pBlendState,
793 psContext.shaded[rt],
794 psContext.shaded[1],
795 sample,
796 pColorSample,
797 blendOut,
798 &psContext.oMask,
799 (simdscalari*)&coverageMask);
800 }
801
802 // final write mask
803 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
804
805 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
806 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
807
808 const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
809
810 // store with color mask
811 if(!pRTBlend->writeDisableRed)
812 {
813 _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
814 }
815 if(!pRTBlend->writeDisableGreen)
816 {
817 _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
818 }
819 if(!pRTBlend->writeDisableBlue)
820 {
821 _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
822 }
823 if(!pRTBlend->writeDisableAlpha)
824 {
825 _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
826 }
827 }
828 }
829
830 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
831 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
832 {
833 RDTSC_START(BESetup);
834 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
835 static const bool bInputCoverage = (bool)inputCoverage;
836 static const bool bCentroidPos = (bool)centroidPos;
837
838 SWR_CONTEXT *pContext = pDC->pContext;
839 const API_STATE& state = GetApiState(pDC);
840 const SWR_RASTSTATE& rastState = state.rastState;
841 const SWR_PS_STATE *pPSState = &state.psState;
842 const SWR_BLEND_STATE *pBlendState = &state.blendState;
843 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
844 uint64_t coverageMask = work.coverageMask[0];
845
846 // broadcast scalars
847 BarycentricCoeffs coeffs;
848 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
849 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
850 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
851
852 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
853 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
854 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
855
856 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
857 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
858 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
859
860 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
861
862 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
863 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
864 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
865
866 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
867 uint32_t NumRT = state.psState.numRenderTargets;
868 for(uint32_t rt = 0; rt < NumRT; ++rt)
869 {
870 pColorBase[rt] = renderBuffers.pColor[rt];
871 }
872 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
873 RDTSC_STOP(BESetup, 0, 0);
874
875 SWR_PS_CONTEXT psContext;
876 psContext.pAttribs = work.pAttribs;
877 psContext.pPerspAttribs = work.pPerspAttribs;
878 psContext.frontFace = work.triFlags.frontFacing;
879 psContext.primID = work.triFlags.primID;
880
881 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
882 psContext.I = work.I;
883 psContext.J = work.J;
884 psContext.recipDet = work.recipDet;
885 psContext.pRecipW = work.pRecipW;
886 psContext.pSamplePosX = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX;
887 psContext.pSamplePosY = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY;
888
889 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
890 {
891 // UL pixel corner
892 psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
893 // pixel center
894 psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
895
896 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
897 {
898 if(bInputCoverage)
899 {
900 generateInputCoverage<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
901 }
902
903 if(coverageMask & MASK)
904 {
905 RDTSC_START(BEBarycentric);
906 psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
907 // pixel center
908 psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
909
910 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
911
912 if(bCentroidPos)
913 {
914 // for 1x case, centroid is pixel center
915 psContext.vX.centroid = psContext.vX.center;
916 psContext.vY.centroid = psContext.vY.center;
917 psContext.vI.centroid = psContext.vI.center;
918 psContext.vJ.centroid = psContext.vJ.center;
919 psContext.vOneOverW.centroid = psContext.vOneOverW.center;
920 }
921
922 // interpolate z
923 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
924 RDTSC_STOP(BEBarycentric, 0, 0);
925
926 simdmask clipCoverageMask = coverageMask & MASK;
927
928 // interpolate user clip distance if available
929 if(rastState.clipDistanceMask)
930 {
931 clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
932 psContext.vI.center, psContext.vJ.center);
933 }
934
935 simdscalar vCoverageMask = vMask(clipCoverageMask);
936 simdscalar depthPassMask = vCoverageMask;
937 simdscalar stencilPassMask = vCoverageMask;
938
939 // Early-Z?
940 if(CanEarlyZ(pPSState))
941 {
942 RDTSC_START(BEEarlyDepthTest);
943 depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
944 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
945 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
946
947 // early-exit if no pixels passed depth or earlyZ is forced on
948 if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
949 {
950 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
951 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
952
953 if (!_simd_movemask_ps(depthPassMask))
954 {
955 goto Endtile;
956 }
957 }
958 }
959
960 psContext.sampleIndex = 0;
961 psContext.activeMask = _simd_castps_si(vCoverageMask);
962
963 // execute pixel shader
964 RDTSC_START(BEPixelShader);
965 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
966 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
967 RDTSC_STOP(BEPixelShader, 0, 0);
968
969 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
970
971 // late-Z
972 if(!CanEarlyZ(pPSState))
973 {
974 RDTSC_START(BELateDepthTest);
975 depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
976 psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
977 RDTSC_STOP(BELateDepthTest, 0, 0);
978
979 if(!_simd_movemask_ps(depthPassMask))
980 {
981 // need to call depth/stencil write for stencil write
982 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
983 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
984 goto Endtile;
985 }
986 }
987
988 uint32_t statMask = _simd_movemask_ps(depthPassMask);
989 uint32_t statCount = _mm_popcnt_u32(statMask);
990 UPDATE_STAT(DepthPassCount, statCount);
991
992 // output merger
993 RDTSC_START(BEOutputMerger);
994 backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc,
995 vCoverageMask, depthPassMask);
996
997 // do final depth write after all pixel kills
998 if (!pPSState->forceEarlyZ)
999 {
1000 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1001 pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
1002 }
1003 RDTSC_STOP(BEOutputMerger, 0, 0);
1004 }
1005
1006 Endtile:
1007 RDTSC_START(BEEndTile);
1008 coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1009 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1010 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1011
1012 for(uint32_t rt = 0; rt < NumRT; ++rt)
1013 {
1014 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1015 }
1016 RDTSC_STOP(BEEndTile, 0, 0);
1017 }
1018 }
1019 }
1020
1021 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
1022 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1023 {
1024 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
1025 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
1026 static const bool bInputCoverage = (bool)inputCoverage;
1027 static const bool bCentroidPos = (bool)centroidPos;
1028
1029 RDTSC_START(BESetup);
1030
1031 SWR_CONTEXT *pContext = pDC->pContext;
1032 const API_STATE& state = GetApiState(pDC);
1033 const SWR_RASTSTATE& rastState = state.rastState;
1034 const SWR_PS_STATE *pPSState = &state.psState;
1035 const SWR_BLEND_STATE *pBlendState = &state.blendState;
1036 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1037
1038 // broadcast scalars
1039 BarycentricCoeffs coeffs;
1040 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1041 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1042 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1043
1044 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1045 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1046 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1047
1048 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1049 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1050 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1051
1052 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1053
1054 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
1055 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
1056 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
1057
1058 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
1059 uint32_t NumRT = state.psState.numRenderTargets;
1060 for(uint32_t rt = 0; rt < NumRT; ++rt)
1061 {
1062 pColorBase[rt] = renderBuffers.pColor[rt];
1063 }
1064 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1065 RDTSC_STOP(BESetup, 0, 0);
1066
1067 SWR_PS_CONTEXT psContext;
1068 psContext.pAttribs = work.pAttribs;
1069 psContext.pPerspAttribs = work.pPerspAttribs;
1070 psContext.pRecipW = work.pRecipW;
1071 psContext.frontFace = work.triFlags.frontFacing;
1072 psContext.primID = work.triFlags.primID;
1073
1074 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
1075 psContext.I = work.I;
1076 psContext.J = work.J;
1077 psContext.recipDet = work.recipDet;
1078 psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
1079 psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
1080 const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples;
1081
1082 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1083 {
1084 // UL pixel corner
1085 psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
1086 // pixel center
1087 psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
1088
1089 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1090 {
1091 psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
1092 // pixel center
1093 psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
1094
1095 RDTSC_START(BEBarycentric);
1096 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1097 RDTSC_STOP(BEBarycentric, 0, 0);
1098
1099 if(bInputCoverage)
1100 {
1101 generateInputCoverage<sampleCount, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
1102 }
1103
1104 if(bCentroidPos)
1105 {
1106 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
1107 RDTSC_START(BEBarycentric);
1108 backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
1109 RDTSC_STOP(BEBarycentric, 0, 0);
1110 }
1111
1112 for(uint32_t sample = 0; sample < numSamples; sample++)
1113 {
1114 if (work.coverageMask[sample] & MASK)
1115 {
1116 RDTSC_START(BEBarycentric);
1117
1118 // calculate per sample positions
1119 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
1120 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
1121
1122 simdmask coverageMask = work.coverageMask[sample] & MASK;
1123 simdscalar vCoverageMask = vMask(coverageMask);
1124
1125 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1126
1127 // interpolate z
1128 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1129
1130 RDTSC_STOP(BEBarycentric, 0, 0);
1131
1132 // interpolate user clip distance if available
1133 if (rastState.clipDistanceMask)
1134 {
1135 coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1136 psContext.vI.sample, psContext.vJ.sample);
1137 }
1138
1139 simdscalar depthPassMask = vCoverageMask;
1140 simdscalar stencilPassMask = vCoverageMask;
1141
1142 // offset depth/stencil buffers current sample
1143 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1144 uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1145
1146 // Early-Z?
1147 if (CanEarlyZ(pPSState))
1148 {
1149 RDTSC_START(BEEarlyDepthTest);
1150 depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
1151 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1152 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1153
1154 // early-exit if no samples passed depth or earlyZ is forced on.
1155 if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
1156 {
1157 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1158 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1159
1160 if (!_simd_movemask_ps(depthPassMask))
1161 {
1162 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1163 continue;
1164 }
1165 }
1166 }
1167
1168 psContext.sampleIndex = sample;
1169 psContext.activeMask = _simd_castps_si(vCoverageMask);
1170
1171 // execute pixel shader
1172 RDTSC_START(BEPixelShader);
1173 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
1174 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1175 RDTSC_STOP(BEPixelShader, 0, 0);
1176
1177 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
1178
1179 //// late-Z
1180 if (!CanEarlyZ(pPSState))
1181 {
1182 RDTSC_START(BELateDepthTest);
1183 depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
1184 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1185 RDTSC_STOP(BELateDepthTest, 0, 0);
1186
1187 if (!_simd_movemask_ps(depthPassMask))
1188 {
1189 // need to call depth/stencil write for stencil write
1190 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1191 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1192
1193 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1194 continue;
1195 }
1196 }
1197
1198 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1199 uint32_t statCount = _mm_popcnt_u32(statMask);
1200 UPDATE_STAT(DepthPassCount, statCount);
1201
1202 // output merger
1203 RDTSC_START(BEOutputMerger);
1204 backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
1205 vCoverageMask, depthPassMask);
1206
1207 // do final depth write after all pixel kills
1208 if (!pPSState->forceEarlyZ)
1209 {
1210 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1211 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1212 }
1213 RDTSC_STOP(BEOutputMerger, 0, 0);
1214 }
1215 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1216 }
1217 RDTSC_START(BEEndTile);
1218 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1219 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1220
1221 for (uint32_t rt = 0; rt < NumRT; ++rt)
1222 {
1223 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1224 }
1225 RDTSC_STOP(BEEndTile, 0, 0);
1226 }
1227 }
1228 }
1229
1230 template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
1231 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1232 {
1233 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
1234 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
1235 static const bool bIsStandardPattern = (bool)samplePattern;
1236 static const bool bInputCoverage = (bool)inputCoverage;
1237 static const bool bCentroidPos = (bool)centroidPos;
1238 static const bool bForcedSampleCount = (bool)forcedSampleCount;
1239
1240 RDTSC_START(BESetup);
1241
1242 SWR_CONTEXT *pContext = pDC->pContext;
1243 const API_STATE& state = GetApiState(pDC);
1244 const SWR_RASTSTATE& rastState = state.rastState;
1245 const SWR_PS_STATE *pPSState = &state.psState;
1246 const SWR_BLEND_STATE *pBlendState = &state.blendState;
1247 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1248
1249 // broadcast scalars
1250 BarycentricCoeffs coeffs;
1251 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1252 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1253 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1254
1255 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1256 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1257 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1258
1259 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1260 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1261 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1262
1263 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1264
1265 coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
1266 coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
1267 coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
1268
1269 uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
1270 uint32_t NumRT = state.psState.numRenderTargets;
1271 for(uint32_t rt = 0; rt < NumRT; ++rt)
1272 {
1273 pColorBase[rt] = renderBuffers.pColor[rt];
1274 }
1275 uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1276 RDTSC_STOP(BESetup, 0, 0);
1277
1278 SWR_PS_CONTEXT psContext;
1279 psContext.pAttribs = work.pAttribs;
1280 psContext.pPerspAttribs = work.pPerspAttribs;
1281 psContext.frontFace = work.triFlags.frontFacing;
1282 psContext.primID = work.triFlags.primID;
1283 psContext.pRecipW = work.pRecipW;
1284 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
1285 psContext.I = work.I;
1286 psContext.J = work.J;
1287 psContext.recipDet = work.recipDet;
1288 psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
1289 psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
1290 psContext.sampleIndex = 0;
1291
1292 uint32_t numCoverageSamples;
1293 if(bIsStandardPattern)
1294 {
1295 numCoverageSamples = MultisampleTraits<sampleCount>::numSamples;
1296 }
1297 else
1298 {
1299 numCoverageSamples = 1;
1300 }
1301
1302 uint32_t numOMSamples;
1303 // RT has to be single sample if we're in forcedMSAA mode
1304 if(bForcedSampleCount && (sampleCount > SWR_MULTISAMPLE_1X))
1305 {
1306 numOMSamples = 1;
1307 }
1308 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
1309 else if(bForcedSampleCount && (sampleCount == SWR_MULTISAMPLE_1X))
1310 {
1311 numOMSamples = GetNumSamples(pBlendState->sampleCount);
1312 }
1313 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
1314 else
1315 {
1316 numOMSamples = MultisampleTraits<sampleCount>::numSamples;
1317 }
1318
1319 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1320 {
1321 psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
1322 psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
1323 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1324 {
1325 simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
1326 psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
1327 // set pixel center positions
1328 psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
1329
1330 if (bInputCoverage)
1331 {
1332 generateInputCoverage<sampleCount, bIsStandardPattern, bForcedSampleCount>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
1333 }
1334
1335 if(bCentroidPos)
1336 {
1337 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
1338 RDTSC_START(BEBarycentric);
1339 backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
1340 RDTSC_STOP(BEBarycentric, 0, 0);
1341 }
1342
1343 // if oDepth written to, or there is a potential to discard any samples, we need to
1344 // run the PS early, then interp or broadcast Z and test
1345 if(pPSState->writesODepth || pPSState->killsPixel)
1346 {
1347 RDTSC_START(BEBarycentric);
1348 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1349
1350 // interpolate z
1351 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1352 RDTSC_STOP(BEBarycentric, 0, 0);
1353
1354 // execute pixel shader
1355 RDTSC_START(BEPixelShader);
1356 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1357 RDTSC_STOP(BEPixelShader, 0, 0);
1358 }
1359 else
1360 {
1361 psContext.activeMask = _simd_set1_epi32(-1);
1362 }
1363
1364 // need to declare enough space for all samples
1365 simdscalar vCoverageMask[MultisampleTraits<sampleCount>::numSamples];
1366 simdscalar depthPassMask[MultisampleTraits<sampleCount>::numSamples];
1367 simdscalar stencilPassMask[MultisampleTraits<sampleCount>::numSamples];
1368 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
1369 simdscalar anyStencilSamplePassed = _simd_setzero_ps();
1370 for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
1371 {
1372 vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
1373
1374 // pull mask back out for any discards and and with coverage
1375 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
1376
1377 if (!_simd_movemask_ps(vCoverageMask[sample]))
1378 {
1379 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
1380 continue;
1381 }
1382
1383 if(bForcedSampleCount)
1384 {
1385 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
1386 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
1387 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
1388 continue;
1389 }
1390
1391 depthPassMask[sample] = vCoverageMask[sample];
1392
1393 // if oDepth isn't written to, we need to interpolate Z for each sample
1394 // if clip distances are enabled, we need to interpolate for each sample
1395 if(!pPSState->writesODepth || rastState.clipDistanceMask)
1396 {
1397 RDTSC_START(BEBarycentric);
1398 if(bIsStandardPattern)
1399 {
1400 // calculate per sample positions
1401 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
1402 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
1403 }
1404 else
1405 {
1406 psContext.vX.sample = psContext.vX.center;
1407 psContext.vY.sample = psContext.vY.center;
1408 }
1409
1410 // calc I & J per sample
1411 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1412
1413 // interpolate z
1414 if (!pPSState->writesODepth)
1415 {
1416 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1417 }
1418
1419 ///@todo: perspective correct vs non-perspective correct clipping?
1420 // interpolate clip distances
1421 if (rastState.clipDistanceMask)
1422 {
1423 uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
1424 psContext.vI.sample, psContext.vJ.sample);
1425 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
1426 }
1427 RDTSC_STOP(BEBarycentric, 0, 0);
1428 }
1429 // else 'broadcast' and test psContext.vZ written from the PS each sample
1430 else
1431 {
1432 vZ[sample] = psContext.vZ;
1433 }
1434
1435 // offset depth/stencil buffers current sample
1436 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1437 uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1438
1439 // ZTest for this sample
1440 RDTSC_START(BEEarlyDepthTest);
1441 stencilPassMask[sample] = vCoverageMask[sample];
1442 depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
1443 vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
1444 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1445
1446 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
1447 anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
1448 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
1449 uint32_t statCount = _mm_popcnt_u32(statMask);
1450 UPDATE_STAT(DepthPassCount, statCount);
1451 }
1452
1453 // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
1454 if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
1455 {
1456 RDTSC_START(BEBarycentric);
1457 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
1458 // interpolate z
1459 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
1460 RDTSC_STOP(BEBarycentric, 0, 0);
1461
1462 // execute pixel shader
1463 RDTSC_START(BEPixelShader);
1464 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
1465 RDTSC_STOP(BEPixelShader, 0, 0);
1466 }
1467 ///@todo: make sure this works for kill pixel
1468 else if(!_simd_movemask_ps(anyStencilSamplePassed))
1469 {
1470 goto Endtile;
1471 }
1472
1473 // loop over all samples, broadcasting the results of the PS to all passing pixels
1474 for(uint32_t sample = 0; sample < numOMSamples; sample++)
1475 {
1476 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1477 uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1478
1479 // output merger
1480 RDTSC_START(BEOutputMerger);
1481
1482 // skip if none of the pixels for this sample passed
1483 simdscalar coverageMaskSample;
1484 simdscalar depthMaskSample;
1485 simdscalar stencilMaskSample;
1486 simdscalar vInterpolatedZ;
1487
1488 // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
1489 // depth test is disabled, so just set the z val to 0.
1490 if(bForcedSampleCount)
1491 {
1492 coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
1493 vInterpolatedZ = _simd_setzero_ps();
1494 }
1495 else if(bIsStandardPattern)
1496 {
1497 if(!_simd_movemask_ps(depthPassMask[sample]))
1498 {
1499 depthPassMask[sample] = _simd_setzero_ps();
1500 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample],
1501 vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
1502 continue;
1503 }
1504 coverageMaskSample = vCoverageMask[sample];
1505 depthMaskSample = depthPassMask[sample];
1506 stencilMaskSample = stencilPassMask[sample];
1507 vInterpolatedZ = vZ[sample];
1508 }
1509 else
1510 {
1511 // center pattern only needs to use a single depth test as all samples are at the same position
1512 if(!_simd_movemask_ps(depthPassMask[0]))
1513 {
1514 depthPassMask[0] = _simd_setzero_ps();
1515 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0],
1516 vCoverageMask[0], pStencilSample, stencilPassMask[0]);
1517 continue;
1518 }
1519 coverageMaskSample = (vCoverageMask[0]);
1520 depthMaskSample = depthPassMask[0];
1521 stencilMaskSample = stencilPassMask[0];
1522 vInterpolatedZ = vZ[0];
1523 }
1524
1525 // output merger
1526 RDTSC_START(BEOutputMerger);
1527 backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
1528 coverageMaskSample, depthMaskSample);
1529
1530 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample,
1531 coverageMaskSample, pStencilSample, stencilMaskSample);
1532 RDTSC_STOP(BEOutputMerger, 0, 0);
1533 }
1534
1535 Endtile:
1536 RDTSC_START(BEEndTile);
1537 for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
1538 {
1539 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1540 }
1541
1542 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1543 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1544
1545 for(uint32_t rt = 0; rt < NumRT; ++rt)
1546 {
1547 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1548 }
1549 RDTSC_STOP(BEEndTile, 0, 0);
1550 }
1551 }
1552 }
1553 // optimized backend flow with NULL PS
1554 template<uint32_t sampleCountT>
1555 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
1556 {
1557 RDTSC_START(BESetup);
1558
1559 static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
1560 SWR_CONTEXT *pContext = pDC->pContext;
1561 const API_STATE& state = GetApiState(pDC);
1562 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1563
1564 // broadcast scalars
1565 BarycentricCoeffs coeffs;
1566 coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
1567 coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
1568 coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
1569
1570 coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
1571 coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
1572 coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
1573
1574 coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
1575 coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
1576 coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
1577
1578 coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
1579
1580 BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
1581
1582 RDTSC_STOP(BESetup, 0, 0);
1583
1584 SWR_PS_CONTEXT psContext;
1585 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
1586 {
1587 // UL pixel corner
1588 simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
1589
1590 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
1591 {
1592 // UL pixel corners
1593 simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
1594
1595 // iterate over active samples
1596 unsigned long sample = 0;
1597 uint32_t sampleMask = state.blendState.sampleMask;
1598 while (_BitScanForward(&sample, sampleMask))
1599 {
1600 sampleMask &= ~(1 << sample);
1601 if (work.coverageMask[sample] & MASK)
1602 {
1603 RDTSC_START(BEBarycentric);
1604 // calculate per sample positions
1605 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample));
1606 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample));
1607
1608 backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
1609
1610 // interpolate z
1611 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
1612
1613 RDTSC_STOP(BEBarycentric, 0, 0);
1614
1615 simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK);
1616 simdscalar stencilPassMask = vCoverageMask;
1617
1618 // offset depth/stencil buffers current sample
1619 uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
1620 uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
1621
1622 RDTSC_START(BEEarlyDepthTest);
1623 simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
1624 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
1625 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
1626 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
1627 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
1628
1629 uint32_t statMask = _simd_movemask_ps(depthPassMask);
1630 uint32_t statCount = _mm_popcnt_u32(statMask);
1631 UPDATE_STAT(DepthPassCount, statCount);
1632 }
1633 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1634 }
1635 pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1636 pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1637 }
1638 }
1639 }
1640
1641 void InitClearTilesTable()
1642 {
1643 memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
1644
1645 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
1646 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
1647 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
1648 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
1649 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
1650 }
1651
1652 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
1653 PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {};
1654 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {};
1655 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {};
1656 PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {};
1657 PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {};
1658 PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {};
1659 PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2] = {};
1660
1661 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1662 // arguments to static template arguments.
1663 template <uint32_t... ArgsT>
1664 struct OMChooser
1665 {
1666 // Last Arg Terminator
1667 static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg)
1668 {
1669 switch(tArg)
1670 {
1671 case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break;
1672 case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break;
1673 case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break;
1674 case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break;
1675 case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break;
1676 default:
1677 SWR_ASSERT(0 && "Invalid sample count\n");
1678 return nullptr;
1679 break;
1680 }
1681 }
1682
1683 // Recursively parse args
1684 template <typename... TArgsT>
1685 static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1686 {
1687 switch(tArg)
1688 {
1689 case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break;
1690 case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break;
1691 case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break;
1692 case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break;
1693 case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break;
1694 case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break;
1695 case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break;
1696 case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break;
1697 case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break;
1698 default:
1699 SWR_ASSERT(0 && "Invalid RT index\n");
1700 return nullptr;
1701 break;
1702 }
1703 }
1704 };
1705
1706 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1707 // arguments to static template arguments.
1708 template <uint32_t... ArgsT>
1709 struct BECentroidBarycentricChooser
1710 {
1711
1712 // Last Arg Terminator
1713 template <typename... TArgsT>
1714 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg)
1715 {
1716 if(tArg > 0)
1717 {
1718 return CalcCentroidBarycentrics<ArgsT..., 1>;
1719 }
1720
1721 return CalcCentroidBarycentrics<ArgsT..., 0>;
1722 }
1723
1724 // Recursively parse args
1725 template <typename... TArgsT>
1726 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1727 {
1728 switch(tArg)
1729 {
1730 case SWR_MULTISAMPLE_1X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1731 case SWR_MULTISAMPLE_2X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1732 case SWR_MULTISAMPLE_4X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1733 case SWR_MULTISAMPLE_8X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1734 case SWR_MULTISAMPLE_16X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1735 default:
1736 SWR_ASSERT(0 && "Invalid sample count\n");
1737 return nullptr;
1738 break;
1739 }
1740 }
1741
1742 // Recursively parse args
1743 template <typename... TArgsT>
1744 static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1745 {
1746 if(tArg > 0)
1747 {
1748 return BECentroidBarycentricChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1749 }
1750
1751 return BECentroidBarycentricChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1752 }
1753 };
1754
1755 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1756 // arguments to static template arguments.
1757 template <uint32_t... ArgsT>
1758 struct BEChooser
1759 {
1760 // Last Arg Terminator
1761 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
1762 {
1763 switch(tArg)
1764 {
1765 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<ArgsT...>; break;
1766 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<ArgsT...>; break;
1767 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<ArgsT...>; break;
1768 default:
1769 SWR_ASSERT(0 && "Invalid backend func\n");
1770 return nullptr;
1771 break;
1772 }
1773 }
1774
1775
1776 // Recursively parse args
1777 template <typename... TArgsT>
1778 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
1779 {
1780 switch(tArg)
1781 {
1782 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
1783 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
1784 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
1785 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
1786 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
1787 default:
1788 SWR_ASSERT(0 && "Invalid sample count\n");
1789 return nullptr;
1790 break;
1791 }
1792 }
1793
1794 // Recursively parse args
1795 template <typename... TArgsT>
1796 static PFN_BACKEND_FUNC GetFunc(uint32_t tArg, TArgsT... remainingArgs)
1797 {
1798 if(tArg > 0)
1799 {
1800 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
1801 }
1802
1803 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
1804 }
1805 };
1806
1807 template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates>
1808 void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates])
1809 {
1810 for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++)
1811 {
1812 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1813 {
1814 table[rtNum][sampleCount] =
1815 OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount);
1816 }
1817 }
1818 }
1819
1820 template <SWR_MULTISAMPLE_COUNT numSampleRates>
1821 void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2],
1822 PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2],
1823 PFN_CALC_CENTROID_BARYCENTRICS (&centroidTable)[numSampleRates][2][2][2])
1824 {
1825 pixelTable[0] = CalcPixelBarycentrics<0>;
1826 pixelTable[1] = CalcPixelBarycentrics<1>;
1827
1828 sampleTable[0] = CalcSampleBarycentrics<0>;
1829 sampleTable[1] = CalcSampleBarycentrics<1>;
1830
1831 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1832 {
1833 for(uint32_t baryMask = 0; baryMask < 2; baryMask++)
1834 {
1835 for(uint32_t patternNum = 0; patternNum < 2; patternNum++)
1836 {
1837 for(uint32_t forcedSampleEnable = 0; forcedSampleEnable < 2; forcedSampleEnable++)
1838 {
1839 centroidTable[sampleCount][baryMask][patternNum][forcedSampleEnable]=
1840 BECentroidBarycentricChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, baryMask, patternNum, forcedSampleEnable);
1841 }
1842 }
1843 }
1844 }
1845 }
1846
1847 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2])
1848 {
1849 gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1850 gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1851 gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1852 gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
1853 }
1854
1855 template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
1856 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2])
1857 {
1858 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1859 {
1860 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++)
1861 {
1862 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
1863 {
1864 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
1865 {
1866 table[sampleCount][samplePattern][inputCoverage][isCentroid][0] =
1867 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
1868 table[sampleCount][samplePattern][inputCoverage][isCentroid][1] =
1869 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 1, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
1870 }
1871 }
1872 }
1873 }
1874 }
1875
1876 template <uint32_t numSampleRates, uint32_t numCoverageModes>
1877 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2])
1878 {
1879 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
1880 {
1881 for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
1882 {
1883 table[sampleCount][inputCoverage][0] =
1884 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1885 table[sampleCount][inputCoverage][1] =
1886 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
1887 }
1888 }
1889 }
1890
1891 void InitBackendFuncTables()
1892 {
1893 InitBackendSampleFuncTable(gBackendSingleSample);
1894 InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
1895 InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
1896 InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
1897 InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable, gCentroidBarycentricTable);
1898
1899 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
1900 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
1901 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
1902 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
1903 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
1904 }