1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
32 #include "rdtsc_core.h"
34 #include "depthstencil.h"
36 #include "memory/tilingtraits.h"
37 #include "core/multisample.h"
41 const __m128 vTileOffsetsX
= {0.5, KNOB_TILE_X_DIM
- 0.5, 0.5, KNOB_TILE_X_DIM
- 0.5};
42 const __m128 vTileOffsetsY
= {0.5, 0.5, KNOB_TILE_Y_DIM
- 0.5, KNOB_TILE_Y_DIM
- 0.5};
/// @todo move to common lib
/// Expands to a brace initializer holding the negated arguments in reverse
/// order (i0 first, i3 last). Every argument is parenthesized before it is
/// negated so that an expression argument (e.g. `a - b`) is negated as a
/// whole instead of having the minus sign bind to its first operand only.
#define MASKTOVEC(i3,i2,i1,i0) {-(i0),-(i1),-(i2),-(i3)}
46 static const __m128 gMaskToVec
[] = {
65 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, DWORD
[4]);
66 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
68 //////////////////////////////////////////////////////////////////////////
69 /// @brief Process compute work.
70 /// @param pDC - pointer to draw context (dispatch).
71 /// @param workerId - The unique worker ID that is assigned to this thread.
72 /// @param threadGroupId - the linear index for the thread group within the dispatch.
73 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
)
75 RDTSC_START(BEDispatch
);
77 SWR_CONTEXT
*pContext
= pDC
->pContext
;
79 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
80 SWR_ASSERT(pTaskData
!= nullptr);
82 // Ensure spill fill memory has been allocated.
83 if (pDC
->pSpillFill
[workerId
] == nullptr)
85 ///@todo Add state which indicates the spill fill size.
86 pDC
->pSpillFill
[workerId
] = (uint8_t*)pDC
->pArena
->AllocAlignedSync(4096 * 1024, sizeof(float) * 8);
89 const API_STATE
& state
= GetApiState(pDC
);
91 SWR_CS_CONTEXT csContext
{ 0 };
92 csContext
.tileCounter
= threadGroupId
;
93 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
94 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
95 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
96 csContext
.pTGSM
= pContext
->pScratch
[workerId
];
97 csContext
.pSpillFillBuffer
= pDC
->pSpillFill
[workerId
];
99 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
101 UPDATE_STAT(CsInvocations
, state
.totalThreadsInGroup
);
103 RDTSC_STOP(BEDispatch
, 1, 0);
/// @brief Backend handler for a sync work item: invokes the callback stored in the
///        SYNC_DESC when one is set.
/// @param pDC - draw context (not referenced by the visible code).
/// @param workerId - worker thread ID (not referenced by the visible code).
/// @param macroTile - macrotile ID; asserted to decode to tile indices (0, 0).
/// @param pUserData - pointer to the SYNC_DESC describing the callback and its arguments.
106 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
108 SYNC_DESC
*pSync
= (SYNC_DESC
*)pUserData
;
// NOTE(review): the declaration of x and y is not visible in this listing.
111 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
112 SWR_ASSERT(x
== 0 && y
== 0);
// The callback is optional; it receives the three user values stored in the descriptor.
114 if (pSync
->pfnCallbackFunc
!= nullptr)
116 pSync
->pfnCallbackFunc(pSync
->userData
, pSync
->userData2
, pSync
->userData3
);
120 void ProcessQueryStatsBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
122 QUERY_DESC
* pQueryDesc
= (QUERY_DESC
*)pUserData
;
123 SWR_STATS
* pStats
= pQueryDesc
->pStats
;
124 SWR_CONTEXT
*pContext
= pDC
->pContext
;
126 SWR_ASSERT(pStats
!= nullptr);
128 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
130 pStats
->DepthPassCount
+= pContext
->stats
[i
].DepthPassCount
;
132 pStats
->IaVertices
+= pContext
->stats
[i
].IaVertices
;
133 pStats
->IaPrimitives
+= pContext
->stats
[i
].IaPrimitives
;
134 pStats
->VsInvocations
+= pContext
->stats
[i
].VsInvocations
;
135 pStats
->HsInvocations
+= pContext
->stats
[i
].HsInvocations
;
136 pStats
->DsInvocations
+= pContext
->stats
[i
].DsInvocations
;
137 pStats
->GsInvocations
+= pContext
->stats
[i
].GsInvocations
;
138 pStats
->PsInvocations
+= pContext
->stats
[i
].PsInvocations
;
139 pStats
->CInvocations
+= pContext
->stats
[i
].CInvocations
;
140 pStats
->CsInvocations
+= pContext
->stats
[i
].CsInvocations
;
141 pStats
->CPrimitives
+= pContext
->stats
[i
].CPrimitives
;
142 pStats
->GsPrimitives
+= pContext
->stats
[i
].GsPrimitives
;
144 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
146 pStats
->SoWriteOffset
[stream
] += pContext
->stats
[i
].SoWriteOffset
[stream
];
148 /// @note client is required to provide valid write offset before every draw, so we clear
149 /// out the contents of the write offset when storing stats
150 pContext
->stats
[i
].SoWriteOffset
[stream
] = 0;
152 pStats
->SoPrimStorageNeeded
[stream
] += pContext
->stats
[i
].SoPrimStorageNeeded
[stream
];
153 pStats
->SoNumPrimsWritten
[stream
] += pContext
->stats
[i
].SoNumPrimsWritten
[stream
];
158 template<SWR_FORMAT format
>
159 void ClearRasterTile(BYTE
*pTileBuffer
, simdvector
&value
)
161 auto lambda
= [&](int comp
)
163 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
164 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
167 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
168 for (uint32_t i
= 0; i
< numIter
; ++i
)
170 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
/// @brief Clears the scissored region of one macrotile's hot tile to a given value and
///        marks the hot tile HOTTILE_DIRTY.
/// @tparam format - hot tile format; selects the FormatTraits used for conversion and layout.
/// @param pDC - draw context; supplies API state (scissor, sample count) and the hot tile manager.
/// @param rt - render target attachment whose hot tile is cleared.
/// @param macroTile - macrotile ID, decoded into tile indices with MacroTileMgr::getTileIndices.
/// @param clear - clear value, one DWORD per component; each DWORD's bits are loaded as a float.
174 template<SWR_FORMAT format
>
175 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, DWORD clear
[4])
177 // convert clear color to hottile format
178 // clear color is in RGBA float/uint32
// NOTE(review): the declarations of vClear and vComp are not visible in this listing.
180 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
// Broadcast the raw bits of this component's clear value to every SIMD lane.
183 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
// Normalized components are scaled by the format's fromFloat factor and converted to integers.
184 if (FormatTraits
<format
>::isNormalized(comp
))
186 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
187 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
// Pack the component and store it in the slot chosen by the format's swizzle.
189 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
190 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
193 uint32_t tileX
, tileY
;
194 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
195 const API_STATE
& state
= GetApiState(pDC
);
// Inclusive bounds of this macrotile, built from the *_FIXED macrotile dimensions.
197 int top
= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
198 int bottom
= top
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
199 int left
= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
200 int right
= left
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
202 // intersect with scissor
203 top
= std::max(top
, state
.scissorInFixedPoint
.top
);
204 left
= std::max(left
, state
.scissorInFixedPoint
.left
);
205 bottom
= std::min(bottom
, state
.scissorInFixedPoint
.bottom
);
206 right
= std::min(right
, state
.scissorInFixedPoint
.right
);
208 // translate to local hottile origin
209 top
-= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
210 bottom
-= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
211 left
-= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
212 right
-= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
214 // convert to raster tiles
215 top
>>= (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
216 bottom
>>= (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
217 left
>>= (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
218 right
>>= (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
220 const int numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
221 // compute steps between raster tile samples / raster tiles / macro tile rows
222 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
// rasterTileStep is only used below to derive macroTileRowStep.
223 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
224 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
225 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
227 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
);
// Byte offset of the first raster tile to clear (left, top), scaled by the sample count.
228 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, left
, top
)) * numSamples
;
229 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
231 // loop over all raster tiles in the current hot tile
232 for (int y
= top
; y
<= bottom
; ++y
)
234 uint8_t* pRasterTile
= pRasterTileRow
;
235 for (int x
= left
; x
<= right
; ++x
)
// Every sample of a raster tile is cleared separately; samples are rasterTileSampleStep bytes apart.
237 for( int sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
239 ClearRasterTile
<format
>(pRasterTile
, vClear
);
240 pRasterTile
+= rasterTileSampleStep
;
243 pRasterTileRow
+= macroTileRowStep
;
246 pHotTile
->state
= HOTTILE_DIRTY
;
/// @brief Backend handler for a clear work item on one macrotile.
///        The visible code contains two alternative implementations:
///        the first records the clear value in each affected hot tile and sets its state to
///        HOTTILE_CLEAR; the second calls the clear routine from sClearTilesTable directly.
/// @param pDC - draw context.
/// @param workerId - worker thread ID (not referenced by the visible code).
/// @param macroTile - macrotile ID to clear.
/// @param pUserData - pointer to the CLEAR_DESC (flags, clear color, depth and stencil values).
/// NOTE(review): the construct that selects between the two implementations is not visible
/// in this listing, nor are the declarations of clearFloat and clearData.
250 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
// ---- First implementation: defer the clear by tagging the hot tiles. ----
254 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
255 SWR_CONTEXT
*pContext
= pDC
->pContext
;
256 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
257 uint32_t numSamples
= GetNumSamples(sampleCount
);
259 SWR_ASSERT(pClear
->flags
.bits
!= 0); // shouldn't be here without a reason.
261 RDTSC_START(BEClear
);
263 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
265 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_COLOR0
, true, numSamples
);
266 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
// The four clearRTColor elements are copied bit-for-bit as DWORDs.
267 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
268 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
269 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
270 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
271 pHotTile
->state
= HOTTILE_CLEAR
;
274 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
276 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
277 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
278 pHotTile
->state
= HOTTILE_CLEAR
;
281 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
283 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
285 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearStencil
;
286 pHotTile
->state
= HOTTILE_CLEAR
;
289 RDTSC_STOP(BEClear
, 0, 0);
// ---- Second implementation: clear the macrotile immediately through sClearTilesTable. ----
294 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
295 RDTSC_START(BEClear
);
297 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
299 /// @todo clear data should come in as RGBA32_FLOAT
// Here the first four bytes of clearRTColor are read as 8-bit channels and scaled to [0, 1].
// NOTE(review): the first implementation above reads clearRTColor as four DWORD-sized
// elements instead — confirm which layout callers provide.
302 clearFloat
[0] = ((BYTE
*)(&pClear
->clearRTColor
))[0] / 255.0f
;
303 clearFloat
[1] = ((BYTE
*)(&pClear
->clearRTColor
))[1] / 255.0f
;
304 clearFloat
[2] = ((BYTE
*)(&pClear
->clearRTColor
))[2] / 255.0f
;
305 clearFloat
[3] = ((BYTE
*)(&pClear
->clearRTColor
))[3] / 255.0f
;
// Pass the float bit patterns on as DWORDs.
306 clearData
[0] = *(DWORD
*)&clearFloat
[0];
307 clearData
[1] = *(DWORD
*)&clearFloat
[1];
308 clearData
[2] = *(DWORD
*)&clearFloat
[2];
309 clearData
[3] = *(DWORD
*)&clearFloat
[3];
311 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
312 SWR_ASSERT(pfnClearTiles
!= nullptr);
314 pfnClearTiles(pDC
, SWR_ATTACHMENT_COLOR0
, macroTile
, clearData
);
317 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
320 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
321 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
322 SWR_ASSERT(pfnClearTiles
!= nullptr);
324 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, clearData
);
327 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
329 uint32_t value
= pClear
->clearStencil
;
331 clearData
[0] = *(DWORD
*)&value
;
// NOTE(review): no non-null assert on pfnClearTiles is visible here, unlike the color and depth cases.
332 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
334 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, clearData
);
337 RDTSC_STOP(BEClear
, 0, 0);
/// @brief Backend handler that stores one macrotile's hot tile for a single attachment through
///        pContext->pfnStoreTile, performing a pending clear first, then updates the hot tile state.
/// @param pDC - draw context.
/// @param workerId - worker thread ID (not referenced by the visible code).
/// @param macroTile - macrotile ID to store.
/// @param pData - pointer to the STORE_TILES_DESC (attachment and post-store tile state).
/// NOTE(review): several original lines are missing from this listing — the declaration of x
/// and y, the end of the KNOB_ENABLE_RDTSC conditional, and two lines between the GetHotTile
/// call and the pending-clear check. pHotTile is dereferenced in the visible code without a
/// visible null check; confirm against the complete file.
342 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
344 RDTSC_START(BEStoreTiles
);
345 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
346 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// numTiles is only passed to RDTSC_STOP at the end of the function.
348 #ifdef KNOB_ENABLE_RDTSC
349 uint32_t numTiles
= 0;
// Pick the hot tile format that matches the attachment; unknown attachments assert and
// fall back to the color hot tile format.
351 SWR_FORMAT srcFormat
;
352 switch (pDesc
->attachment
)
354 case SWR_ATTACHMENT_COLOR0
:
355 case SWR_ATTACHMENT_COLOR1
:
356 case SWR_ATTACHMENT_COLOR2
:
357 case SWR_ATTACHMENT_COLOR3
:
358 case SWR_ATTACHMENT_COLOR4
:
359 case SWR_ATTACHMENT_COLOR5
:
360 case SWR_ATTACHMENT_COLOR6
:
361 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
362 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
363 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
364 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc
->attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
368 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
370 // Only need to store the hottile if it's been rendered to...
371 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, pDesc
->attachment
, false);
374 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
375 if (pHotTile
->state
== HOTTILE_CLEAR
)
377 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
378 SWR_ASSERT(pfnClearTiles
!= nullptr);
380 pfnClearTiles(pDC
, pDesc
->attachment
, macroTile
, pHotTile
->clearData
);
// Store when the tile is dirty, or when the requested post-store state is itself HOTTILE_DIRTY.
383 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
// Destination coordinates are the macrotile indices scaled by the macrotile dimensions.
385 int destX
= KNOB_MACROTILE_X_DIM
* x
;
386 int destY
= KNOB_MACROTILE_Y_DIM
* y
;
388 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
389 pDesc
->attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
// Only tiles that are dirty or resolved take on the requested post-store state.
393 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
395 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
398 RDTSC_STOP(BEStoreTiles
, numTiles
, pDC
->drawId
);
/// @brief Backend handler that sets the hot tile state to HOTTILE_INVALID for every attachment
///        selected in the descriptor's attachmentMask, for one macrotile.
/// @param pDC - draw context.
/// @param workerId - worker thread ID (not referenced by the visible code).
/// @param macroTile - macrotile ID whose hot tiles are invalidated.
/// @param pData - pointer to the INVALIDATE_TILES_DESC holding the attachment bit mask.
/// NOTE(review): original lines around the state assignment are missing from this listing;
/// pHotTile is written without a visible null check — confirm against the complete file.
402 void ProcessInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
404 INVALIDATE_TILES_DESC
*pDesc
= (INVALIDATE_TILES_DESC
*)pData
;
405 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// Bit i of attachmentMask selects attachment i.
407 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
409 if (pDesc
->attachmentMask
& (1 << i
))
411 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, false);
414 pHotTile
->state
= HOTTILE_INVALID
;
// Per-lane pixel offsets for an 8-wide SIMD: lanes 0-3 cover x 0..1 and lanes 4-7 cover
// x 2..3, each over y 0..1. The "Center" tables add 0.5 to the matching "UL" offsets.
// Any other SIMD width is rejected with #error.
// NOTE(review): the #else / #endif lines of this conditional are not visible in this listing.
420 #if KNOB_SIMD_WIDTH == 8
421 const __m256 vQuadCenterOffsetsX
= { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
422 const __m256 vQuadCenterOffsetsY
= { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
423 const __m256 vQuadULOffsetsX
={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
424 const __m256 vQuadULOffsetsY
={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
427 #error Unsupported vector width
431 bool CanEarlyZ(const SWR_PS_STATE
*pPSState
)
433 return (pPSState
->forceEarlyZ
|| (!pPSState
->writesODepth
&& !pPSState
->usesSourceDepth
&& !pPSState
->usesUAV
));
436 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
438 simdscalar vClipMask
= _simd_setzero_ps();
439 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
441 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
443 // pull triangle clip distance values from clip buffer
444 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
445 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
446 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
449 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
451 // clip if interpolated clip distance is < 0 || NAN
452 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
454 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
457 return _simd_movemask_ps(vClipMask
);
/// @brief Converts per-sample coverage masks into one sample-coverage bit mask per pixel of
///        the SIMD span, written to inputMask.
/// @tparam sampleCountT - multisample count; MultisampleTraits<sampleCountT>::numSamples drives
///         how many sample slots are filled.
/// @tparam bIsStandardPattern - when true, per-sample coverage is gathered from coverageMask;
///         the visible code also contains a path that broadcasts the first entry's coverage
///         to every sample slot.
/// @tparam bForcedSampleCount - when false, each result is ANDed with sampleMask.
/// @param coverageMask - array of per-sample coverage masks.
/// @param inputMask - output array, one mask per SIMD lane.
/// @param sampleMask - sample mask applied to the result unless bForcedSampleCount is set.
/// NOTE(review): the declaration of `mask`, the `else` lines pairing the branches below, and
/// the #else / #endif of the KNOB_ARCH conditional are not visible in this listing.
460 template<SWR_MULTISAMPLE_COUNT sampleCountT
, bool bIsStandardPattern
, bool bForcedSampleCount
>
461 INLINE
void generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
464 // will need to update for avx512
465 assert(KNOB_SIMD_WIDTH
== 8);
468 __m256i sampleCoverage
[2];
469 if(bIsStandardPattern
)
471 __m256i src
= _mm256_set1_epi32(0);
472 __m256i index0
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1
;
// Enable one gather lane per sample; 16 samples also need the second mask and index set.
474 if(MultisampleTraits
<sampleCountT
>::numSamples
== 1)
476 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
478 else if(MultisampleTraits
<sampleCountT
>::numSamples
== 2)
480 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
482 else if(MultisampleTraits
<sampleCountT
>::numSamples
== 4)
484 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
486 else if(MultisampleTraits
<sampleCountT
>::numSamples
== 8)
488 mask
[0] = _mm256_set1_epi32(-1);
490 else if(MultisampleTraits
<sampleCountT
>::numSamples
== 16)
492 mask
[0] = _mm256_set1_epi32(-1);
493 mask
[1] = _mm256_set1_epi32(-1);
494 index1
= _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
497 // gather coverage for samples 0-7
// The gather scale of 8 steps through coverageMask one uint64_t entry per index.
498 sampleCoverage
[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index0
, _mm256_castsi256_ps(mask
[0]), 8));
499 if(MultisampleTraits
<sampleCountT
>::numSamples
> 8)
501 // gather coverage for samples 8-15
502 sampleCoverage
[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index1
, _mm256_castsi256_ps(mask
[1]), 8));
507 // center coverage is the same for all samples; just broadcast to the sample slots
508 uint32_t centerCoverage
= ((uint32_t)(*coverageMask
) & MASK
);
509 if(MultisampleTraits
<sampleCountT
>::numSamples
== 1)
511 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage
);
513 else if(MultisampleTraits
<sampleCountT
>::numSamples
== 2)
515 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage
, centerCoverage
);
517 else if(MultisampleTraits
<sampleCountT
>::numSamples
== 4)
519 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage
, centerCoverage
, centerCoverage
, centerCoverage
);
521 else if(MultisampleTraits
<sampleCountT
>::numSamples
== 8)
523 sampleCoverage
[0] = _mm256_set1_epi32(centerCoverage
);
525 else if(MultisampleTraits
<sampleCountT
>::numSamples
== 16)
527 sampleCoverage
[0] = _mm256_set1_epi32(centerCoverage
);
528 sampleCoverage
[1] = _mm256_set1_epi32(centerCoverage
);
// Byte shuffle control: selects bytes 0x0, 0x4, 0x8 and 0xC of each 128-bit lane.
532 mask
[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
533 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
534 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
535 __m256i packedCoverage0
= _simd_shuffle_epi8(sampleCoverage
[0], mask
[0]);
537 __m256i packedCoverage1
;
538 if(MultisampleTraits
<sampleCountT
>::numSamples
> 8)
540 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
541 packedCoverage1
= _simd_shuffle_epi8(sampleCoverage
[1], mask
[0]);
// The code from here to the declaration of permMask is the KNOB_ARCH_AVX variant of the packing step.
544 #if (KNOB_ARCH == KNOB_ARCH_AVX)
545 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
546 __m256i hiToLow
= _mm256_permute2f128_si256(packedCoverage0
, packedCoverage0
, 0x83);
547 __m256 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
548 packedCoverage0
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), shufRes
, 0xFE));
550 __m256i packedSampleCoverage
;
551 if(MultisampleTraits
<sampleCountT
>::numSamples
> 8)
553 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
554 hiToLow
= _mm256_permute2f128_si256(packedCoverage1
, packedCoverage1
, 0x83);
555 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
556 shufRes
= _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1
), shufRes
, 0xFE);
557 packedCoverage1
= _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes
), _mm256_castps_pd(shufRes
), 0x01)));
558 packedSampleCoverage
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), _mm256_castsi256_ps(packedCoverage1
), 0xFC));
562 packedSampleCoverage
= packedCoverage0
;
// NOTE(review): presumably the alternative (non-AVX) packing variant starts here; the
// preprocessor line separating the two variants is not visible.
565 __m256i permMask
= _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
566 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
567 packedCoverage0
= _mm256_permutevar8x32_epi32(packedCoverage0
, permMask
);
569 __m256i packedSampleCoverage
;
570 if(MultisampleTraits
<sampleCountT
>::numSamples
> 8)
572 permMask
= _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
573 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
574 packedCoverage1
= _mm256_permutevar8x32_epi32(packedCoverage1
, permMask
);
576 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
577 packedSampleCoverage
= _mm256_blend_epi32(packedCoverage0
, packedCoverage1
, 0x0C);
581 packedSampleCoverage
= packedCoverage0
;
// Walk the lanes from last to first: each pass reads the top bit of every byte, then
// shifts the packed coverage left by one bit to expose the next pixel.
585 for(int32_t i
= KNOB_SIMD_WIDTH
- 1; i
>= 0; i
--)
587 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
588 inputMask
[i
] = _simd_movemask_epi8(packedSampleCoverage
);
590 if(!bForcedSampleCount
)
592 // input coverage has to be anded with sample mask if MSAA isn't forced on
593 inputMask
[i
] &= sampleMask
;
596 // shift to the next pixel in the 4x2
597 packedSampleCoverage
= _simd_slli_epi32(packedSampleCoverage
, 1);
601 template<SWR_MULTISAMPLE_COUNT sampleCountT
, bool bIsStandardPattern
, bool bForcedSampleCount
>
602 INLINE
void generateInputCoverage(const uint64_t *const coverageMask
, __m256
&inputCoverage
, const uint32_t sampleMask
)
604 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
605 generateInputCoverage
<sampleCountT
, bIsStandardPattern
, bForcedSampleCount
>(coverageMask
, inputMask
, sampleMask
);
606 inputCoverage
= _simd_castsi_ps(_mm256_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]));
/// @brief Computes the pixel-center barycentrics (vI, vJ) and vOneOverW in psContext from the
///        triangle's barycentric coefficients.
/// @param coeffs - barycentric coefficients (I/J plane terms, vRecipDet, 1/w plane terms).
/// @param psContext - pixel shader context; reads vX/vY.center, writes vI/vJ/vOneOverW.center.
/// NOTE(review): four original lines directly after the signature are missing from this
/// listing and perspMask is not referenced in the visible lines — confirm how it gates this
/// code in the complete file.
609 template<bool perspMask
>
610 INLINE
void CalcPixelBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
// Evaluate I and J with vplaneps at the pixel-center positions.
615 psContext
.vI
.center
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.center
, psContext
.vY
.center
);
616 psContext
.vJ
.center
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.center
, psContext
.vY
.center
);
// Scale both by vRecipDet.
617 psContext
.vI
.center
= _simd_mul_ps(psContext
.vI
.center
, coeffs
.vRecipDet
);
618 psContext
.vJ
.center
= _simd_mul_ps(psContext
.vJ
.center
, coeffs
.vRecipDet
);
// Evaluate 1/w with vplaneps at the (I, J) just computed.
621 psContext
.vOneOverW
.center
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.center
, psContext
.vJ
.center
);
/// @brief Computes the per-sample barycentrics (vI, vJ) and vOneOverW in psContext from the
///        triangle's barycentric coefficients.
/// @param coeffs - barycentric coefficients (I/J plane terms, vRecipDet, 1/w plane terms).
/// @param psContext - pixel shader context; reads vX/vY.sample, writes vI/vJ/vOneOverW.sample.
/// NOTE(review): four original lines directly after the signature are missing from this
/// listing and perspMask is not referenced in the visible lines — confirm how it gates this
/// code in the complete file.
625 template<bool perspMask
>
626 INLINE
void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
// Evaluate I and J with vplaneps at the sample positions.
631 psContext
.vI
.sample
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
632 psContext
.vJ
.sample
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
// Scale both by vRecipDet.
633 psContext
.vI
.sample
= _simd_mul_ps(psContext
.vI
.sample
, coeffs
.vRecipDet
);
634 psContext
.vJ
.sample
= _simd_mul_ps(psContext
.vJ
.sample
, coeffs
.vRecipDet
);
// Evaluate 1/w with vplaneps at the (I, J) just computed.
637 psContext
.vOneOverW
.sample
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
642 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
643 // Centroid behaves exactly as follows :
644 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
645 // have a sample location there).
646 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
647 // coverage with the SampleMask Rasterizer State.
648 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
649 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
650 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
651 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
652 template<SWR_MULTISAMPLE_COUNT sampleCount
, bool bForcedSampleCount
>
653 INLINE
void CalcCentroidPos(SWR_PS_CONTEXT
&psContext
, const uint64_t *const coverageMask
, const uint32_t sampleMask
,
654 const simdscalar vXSamplePosUL
, const simdscalar vYSamplePosUL
)
656 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
658 generateInputCoverage
<sampleCount
, 1, bForcedSampleCount
>(coverageMask
, inputMask
, sampleMask
);
660 // Case (2) - partially covered pixel
662 // scan for first covered sample per pixel in the 4x2 span
663 unsigned long sampleNum
[KNOB_SIMD_WIDTH
];
664 (inputMask
[0] > 0) ? (_BitScanForward(&sampleNum
[0], inputMask
[0])) : (sampleNum
[0] = 0);
665 (inputMask
[1] > 0) ? (_BitScanForward(&sampleNum
[1], inputMask
[1])) : (sampleNum
[1] = 0);
666 (inputMask
[2] > 0) ? (_BitScanForward(&sampleNum
[2], inputMask
[2])) : (sampleNum
[2] = 0);
667 (inputMask
[3] > 0) ? (_BitScanForward(&sampleNum
[3], inputMask
[3])) : (sampleNum
[3] = 0);
668 (inputMask
[4] > 0) ? (_BitScanForward(&sampleNum
[4], inputMask
[4])) : (sampleNum
[4] = 0);
669 (inputMask
[5] > 0) ? (_BitScanForward(&sampleNum
[5], inputMask
[5])) : (sampleNum
[5] = 0);
670 (inputMask
[6] > 0) ? (_BitScanForward(&sampleNum
[6], inputMask
[6])) : (sampleNum
[6] = 0);
671 (inputMask
[7] > 0) ? (_BitScanForward(&sampleNum
[7], inputMask
[7])) : (sampleNum
[7] = 0);
673 // look up and set the sample offsets from UL pixel corner for first covered sample
674 __m256 vXSample
= _mm256_set_ps(MultisampleTraits
<sampleCount
>::X(sampleNum
[7]),
675 MultisampleTraits
<sampleCount
>::X(sampleNum
[6]),
676 MultisampleTraits
<sampleCount
>::X(sampleNum
[5]),
677 MultisampleTraits
<sampleCount
>::X(sampleNum
[4]),
678 MultisampleTraits
<sampleCount
>::X(sampleNum
[3]),
679 MultisampleTraits
<sampleCount
>::X(sampleNum
[2]),
680 MultisampleTraits
<sampleCount
>::X(sampleNum
[1]),
681 MultisampleTraits
<sampleCount
>::X(sampleNum
[0]));
683 __m256 vYSample
= _mm256_set_ps(MultisampleTraits
<sampleCount
>::Y(sampleNum
[7]),
684 MultisampleTraits
<sampleCount
>::Y(sampleNum
[6]),
685 MultisampleTraits
<sampleCount
>::Y(sampleNum
[5]),
686 MultisampleTraits
<sampleCount
>::Y(sampleNum
[4]),
687 MultisampleTraits
<sampleCount
>::Y(sampleNum
[3]),
688 MultisampleTraits
<sampleCount
>::Y(sampleNum
[2]),
689 MultisampleTraits
<sampleCount
>::Y(sampleNum
[1]),
690 MultisampleTraits
<sampleCount
>::Y(sampleNum
[0]));
691 // add sample offset to UL pixel corner
692 vXSample
= _simd_add_ps(vXSamplePosUL
, vXSample
);
693 vYSample
= _simd_add_ps(vYSamplePosUL
, vYSample
);
695 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
696 static const __m256i vFullyCoveredMask
= MultisampleTraits
<sampleCount
>::FullSampleMask();
697 __m256i vInputCoveragei
= _mm256_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]);
698 __m256i vAllSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vFullyCoveredMask
);
700 static const __m256i vZero
= _simd_setzero_si();
701 const __m256i vSampleMask
= _simd_and_si(_simd_set1_epi32(sampleMask
), vFullyCoveredMask
);
702 __m256i vNoSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vZero
);
703 __m256i vIsFullSampleMask
= _simd_cmpeq_epi32(vSampleMask
, vFullyCoveredMask
);
704 __m256i vCase3b
= _simd_and_si(vNoSamplesCovered
, vIsFullSampleMask
);
706 __m256i vEvalAtCenter
= _simd_or_si(vAllSamplesCovered
, vCase3b
);
708 // set the centroid position based on results from above
709 psContext
.vX
.centroid
= _simd_blendv_ps(vXSample
, psContext
.vX
.center
, _simd_castsi_ps(vEvalAtCenter
));
710 psContext
.vY
.centroid
= _simd_blendv_ps(vYSample
, psContext
.vY
.center
, _simd_castsi_ps(vEvalAtCenter
));
712 // Case (3a) No samples covered and partial sample mask
713 __m256i vSomeSampleMaskSamples
= _simd_cmplt_epi32(vSampleMask
, vFullyCoveredMask
);
714 // sample mask should never be all 0's for this case, but handle it anyways
715 unsigned long firstCoveredSampleMaskSample
= 0;
716 (sampleMask
> 0) ? (_BitScanForward(&firstCoveredSampleMaskSample
, sampleMask
)) : (firstCoveredSampleMaskSample
= 0);
718 __m256i vCase3a
= _simd_and_si(vNoSamplesCovered
, vSomeSampleMaskSamples
);
720 vXSample
= _simd_set1_ps(MultisampleTraits
<sampleCount
>::X(firstCoveredSampleMaskSample
));
721 vYSample
= _simd_set1_ps(MultisampleTraits
<sampleCount
>::Y(firstCoveredSampleMaskSample
));
723 // blend in case 3a pixel locations
724 psContext
.vX
.centroid
= _simd_blendv_ps(psContext
.vX
.centroid
, vXSample
, _simd_castsi_ps(vCase3a
));
725 psContext
.vY
.centroid
= _simd_blendv_ps(psContext
.vY
.centroid
, vYSample
, _simd_castsi_ps(vCase3a
));
/// @brief Determines the centroid evaluation position for each pixel, then computes the
///        centroid barycentrics (vI, vJ) and vOneOverW in psContext.
/// @tparam sampleCount - multisample count, cast to SWR_MULTISAMPLE_COUNT for CalcCentroidPos.
/// @tparam persp - converted to the bool bPersp.
/// @tparam standardPattern - non-zero selects CalcCentroidPos for the centroid position.
/// @tparam forcedMultisampleCount - forwarded to CalcCentroidPos as a bool.
/// @param coeffs - barycentric coefficients (I/J plane terms, vRecipDet, 1/w plane terms).
/// @param psContext - pixel shader context; writes vX/vY/vI/vJ/vOneOverW.centroid.
/// @param coverageMask - array of per-sample coverage masks.
/// @param sampleMask - rasterizer sample mask.
/// @param vXSamplePosUL - x position of each pixel's upper-left corner.
/// @param vYSamplePosUL - y position of each pixel's upper-left corner.
/// NOTE(review): two original lines before the bIsStandardPattern test and the `else` line
/// are missing from this listing; bPersp is not referenced in the visible lines — confirm
/// how it gates this code in the complete file.
728 template<uint32_t sampleCount
, uint32_t persp
, uint32_t standardPattern
, uint32_t forcedMultisampleCount
>
729 INLINE
void CalcCentroidBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
,
730 const uint64_t *const coverageMask
, const uint32_t sampleMask
,
731 const simdscalar vXSamplePosUL
, const simdscalar vYSamplePosUL
)
733 static const bool bPersp
= (bool)persp
;
734 static const bool bIsStandardPattern
= (bool)standardPattern
;
735 static const bool bForcedMultisampleCount
= (bool)forcedMultisampleCount
;
737 // calculate centroid positions
740 if(bIsStandardPattern
)
742 /// @todo don't need to generate input coverage 2x if input coverage and centroid
743 CalcCentroidPos
<(SWR_MULTISAMPLE_COUNT
)sampleCount
, bForcedMultisampleCount
>(psContext
, coverageMask
, sampleMask
, vXSamplePosUL
, vYSamplePosUL
);
// Presumably the non-standard-pattern path: the centroid is placed 0.5 from the upper-left corner.
747 static const __m256 pixelCenter
= _simd_set1_ps(0.5f
);
748 psContext
.vX
.centroid
= _simd_add_ps(vXSamplePosUL
, pixelCenter
);
749 psContext
.vY
.centroid
= _simd_add_ps(vYSamplePosUL
, pixelCenter
);
// Evaluate I and J with vplaneps at the centroid positions, then scale by vRecipDet.
752 psContext
.vI
.centroid
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
753 psContext
.vJ
.centroid
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
754 psContext
.vI
.centroid
= _simd_mul_ps(psContext
.vI
.centroid
, coeffs
.vRecipDet
);
755 psContext
.vJ
.centroid
= _simd_mul_ps(psContext
.vJ
.centroid
, coeffs
.vRecipDet
);
// Evaluate 1/w with vplaneps at the (I, J) just computed.
758 psContext
.vOneOverW
.centroid
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.centroid
, psContext
.vJ
.centroid
);
/// @brief For each of the first NumRT render targets: seeds blendOut with the
///        pixel shader output, runs the per-target blend function when one is
///        set, and mask-stores the enabled channels to the color buffer.
/// @param psContext      supplies shaded[rt], the pixel shader output per render target
/// @param pColorBase     per-render-target color base pointers
/// @param sample         sample index used to compute RasterTileColorOffset
/// @param pBlendState    supplies renderTarget[rt] and its writeDisable* flags
/// @param pfnBlendFunc   per-render-target blend functions; nullptr entries are tested for
/// @param coverageMask   passed by reference; its address is handed to the blend call
/// @param depthPassMask  ANDed with coverageMask to build the store mask
// NOTE(review): the declaration of blendOut and the head of the blend call are
// not visible in this listing (only two of its arguments are) - confirm against
// the original file before changing this function.
762 template<uint32_t NumRT
, uint32_t sampleCountT
>
763 void OutputMerger(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
764 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar depthPassMask
)
766 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
767 static const SWR_MULTISAMPLE_COUNT sampleCount
= (SWR_MULTISAMPLE_COUNT
)sampleCountT
;
768 uint32_t rasterTileColorOffset
= MultisampleTraits
<sampleCount
>::RasterTileColorOffset(sample
);
771 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
773 uint8_t *pColorSample
;
// 1X uses the base pointer directly; the other assignment below adds the per-sample offset
774 if(sampleCount
== SWR_MULTISAMPLE_1X
)
776 pColorSample
= pColorBase
[rt
];
780 pColorSample
= pColorBase
[rt
] + rasterTileColorOffset
;
783 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
784 // pfnBlendFunc may not update all channels. Initialize with PS output.
785 /// TODO: move this into the blend JIT.
786 blendOut
= psContext
.shaded
[rt
];
788 // Blend outputs and update coverage mask for alpha test
789 if(pfnBlendFunc
[rt
] != nullptr)
793 psContext
.shaded
[rt
],
799 (simdscalari
*)&coverageMask
);
// only lanes set in both coverageMask and depthPassMask are stored
803 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
805 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
806 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
// byte offset between the x, y, z and w channel stores below
808 const uint32_t simd
= KNOB_SIMD_WIDTH
* sizeof(float);
810 // store with color mask
811 if(!pRTBlend
->writeDisableRed
)
813 _simd_maskstore_ps((float*)pColorSample
, outputMask
, blendOut
.x
);
815 if(!pRTBlend
->writeDisableGreen
)
817 _simd_maskstore_ps((float*)(pColorSample
+ simd
), outputMask
, blendOut
.y
);
819 if(!pRTBlend
->writeDisableBlue
)
821 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 2), outputMask
, blendOut
.z
);
823 if(!pRTBlend
->writeDisableAlpha
)
825 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 3), outputMask
, blendOut
.w
);
/// @brief Single-sample backend. Steps over a KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM
///        area starting at (x, y) in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM increments.
///        For a step with coverage it computes barycentrics and Z, applies user
///        clip distances, runs the depth/stencil test (before the pixel shader
///        when CanEarlyZ, otherwise after it), invokes the pixel shader and the
///        output merger, and writes depth/stencil.
/// @param pDC            draw context; source of the API state and backend function table
/// @param workerId       not referenced in the visible lines
/// @param x              starting x coordinate of the loop over the tile
/// @param y              starting y coordinate of the loop over the tile
/// @param work           triangle descriptor: coverage mask, plane coefficients, attribute pointers
/// @param renderBuffers  color, depth and stencil base pointers
// NOTE(review): several `if` lines below have no visible body (e.g. the
// "no pixels passed depth" checks), and bInputCoverage / bCentroidPos are never
// referenced. Control-flow lines appear to be missing from this listing -
// confirm against the original file.
830 template<uint32_t sampleCountT
, uint32_t samplePattern
, uint32_t inputCoverage
, uint32_t centroidPos
, uint32_t forcedSampleCount
>
831 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
833 RDTSC_START(BESetup
);
834 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
835 static const bool bInputCoverage
= (bool)inputCoverage
;
836 static const bool bCentroidPos
= (bool)centroidPos
;
838 SWR_CONTEXT
*pContext
= pDC
->pContext
;
839 const API_STATE
& state
= GetApiState(pDC
);
840 const SWR_RASTSTATE
& rastState
= state
.rastState
;
841 const SWR_PS_STATE
*pPSState
= &state
.psState
;
842 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
843 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
// local copy of the sample-0 coverage mask; it is shifted as the loops below advance
844 uint64_t coverageMask
= work
.coverageMask
[0];
// broadcast the per-triangle scalar coefficients into SIMD registers
847 BarycentricCoeffs coeffs
;
848 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
849 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
850 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
852 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
853 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
854 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
856 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
857 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
858 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
860 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
862 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
863 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
864 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// working copies of the buffer pointers; they are advanced after every SIMD tile
866 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
867 uint32_t NumRT
= state
.psState
.numRenderTargets
;
868 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
870 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
872 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
873 RDTSC_STOP(BESetup
, 0, 0);
// per-triangle pixel shader inputs, set once before the tile loops
875 SWR_PS_CONTEXT psContext
;
876 psContext
.pAttribs
= work
.pAttribs
;
877 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
878 psContext
.frontFace
= work
.triFlags
.frontFacing
;
879 psContext
.primID
= work
.triFlags
.primID
;
881 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
882 psContext
.I
= work
.I
;
883 psContext
.J
= work
.J
;
884 psContext
.recipDet
= work
.recipDet
;
885 psContext
.pRecipW
= work
.pRecipW
;
886 psContext
.pSamplePosX
= (const float*)&MultisampleTraits
<SWR_MULTISAMPLE_1X
>::samplePosX
;
887 psContext
.pSamplePosY
= (const float*)&MultisampleTraits
<SWR_MULTISAMPLE_1X
>::samplePosY
;
889 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
892 psContext
.vY
.UL
= _simd_add_ps(vQuadULOffsetsY
, _simd_set1_ps((float)yy
));
894 psContext
.vY
.center
= _simd_add_ps(vQuadCenterOffsetsY
, _simd_set1_ps((float)yy
));
896 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
900 generateInputCoverage
<SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, false>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
903 if(coverageMask
& MASK
)
905 RDTSC_START(BEBarycentric
);
906 psContext
.vX
.UL
= _simd_add_ps(vQuadULOffsetsX
, _simd_set1_ps((float)xx
));
908 psContext
.vX
.center
= _simd_add_ps(vQuadCenterOffsetsX
, _simd_set1_ps((float)xx
));
910 backendFuncs
.pfnCalcPixelBarycentrics(coeffs
, psContext
);
914 // for 1x case, centroid is pixel center
915 psContext
.vX
.centroid
= psContext
.vX
.center
;
916 psContext
.vY
.centroid
= psContext
.vY
.center
;
917 psContext
.vI
.centroid
= psContext
.vI
.center
;
918 psContext
.vJ
.centroid
= psContext
.vJ
.center
;
919 psContext
.vOneOverW
.centroid
= psContext
.vOneOverW
.center
;
// evaluate the Z plane at the pixel-center barycentrics
923 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
924 RDTSC_STOP(BEBarycentric
, 0, 0);
926 simdmask clipCoverageMask
= coverageMask
& MASK
;
928 // interpolate user clip distance if available
929 if(rastState
.clipDistanceMask
)
931 clipCoverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
932 psContext
.vI
.center
, psContext
.vJ
.center
);
// depth and stencil pass masks both start out equal to the clipped coverage
935 simdscalar vCoverageMask
= vMask(clipCoverageMask
);
936 simdscalar depthPassMask
= vCoverageMask
;
937 simdscalar stencilPassMask
= vCoverageMask
;
// depth/stencil test ahead of the pixel shader when CanEarlyZ allows it
940 if(CanEarlyZ(pPSState
))
942 RDTSC_START(BEEarlyDepthTest
);
943 depthPassMask
= DepthStencilTest(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
,
944 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
945 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
947 // early-exit if no pixels passed depth or earlyZ is forced on
948 if(pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
950 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
951 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
953 if (!_simd_movemask_ps(depthPassMask
))
960 psContext
.sampleIndex
= 0;
961 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
963 // execute pixel shader
964 RDTSC_START(BEPixelShader
);
965 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
966 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
967 RDTSC_STOP(BEPixelShader
, 0, 0);
// read the active mask back from the shader context after the shader has run
969 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// depth/stencil test after the pixel shader when it could not be done early
972 if(!CanEarlyZ(pPSState
))
974 RDTSC_START(BELateDepthTest
);
975 depthPassMask
= DepthStencilTest(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
,
976 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
977 RDTSC_STOP(BELateDepthTest
, 0, 0);
979 if(!_simd_movemask_ps(depthPassMask
))
981 // need to call depth/stencil write for stencil write
982 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
983 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
// count the lanes that passed the depth test
988 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
989 uint32_t statCount
= _mm_popcnt_u32(statMask
);
990 UPDATE_STAT(DepthPassCount
, statCount
);
993 RDTSC_START(BEOutputMerger
);
994 backendFuncs
.pfnOutputMerger(psContext
, pColorBase
, 0, pBlendState
, state
.pfnBlendFunc
,
995 vCoverageMask
, depthPassMask
);
997 // do final depth write after all pixel kills
998 if (!pPSState
->forceEarlyZ
)
1000 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1001 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
1003 RDTSC_STOP(BEOutputMerger
, 0, 0);
// advance the coverage mask and the depth/stencil/color pointers by one SIMD tile
1007 RDTSC_START(BEEndTile
);
1008 coverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1009 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1010 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1012 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1014 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1016 RDTSC_STOP(BEEndTile
, 0, 0);
/// @brief Sample-rate backend. Steps over a KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM
///        area starting at (x, y) in SIMD tile increments. For every sample with
///        coverage in the current SIMD tile it computes per-sample barycentrics
///        and Z, runs the depth/stencil test (early or late depending on
///        CanEarlyZ), invokes the pixel shader with sampleIndex set to that
///        sample, then the output merger and the depth/stencil write.
/// @param pDC            draw context; source of the API state and backend function table
/// @param workerId       not referenced in the visible lines
/// @param x              starting x coordinate of the loop over the tile
/// @param y              starting y coordinate of the loop over the tile
/// @param work           triangle descriptor; work.coverageMask[sample] is shifted in place
/// @param renderBuffers  color, depth and stencil base pointers
// NOTE(review): several `if` lines below have no visible body and
// bInputCoverage / bCentroidPos are never referenced. Control-flow lines appear
// to be missing from this listing - confirm against the original file.
1021 template<uint32_t sampleCountT
, uint32_t samplePattern
, uint32_t inputCoverage
, uint32_t centroidPos
, uint32_t forcedSampleCount
>
1022 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1024 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
1025 static const SWR_MULTISAMPLE_COUNT sampleCount
= (SWR_MULTISAMPLE_COUNT
)sampleCountT
;
1026 static const bool bInputCoverage
= (bool)inputCoverage
;
1027 static const bool bCentroidPos
= (bool)centroidPos
;
1029 RDTSC_START(BESetup
);
1031 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1032 const API_STATE
& state
= GetApiState(pDC
);
1033 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1034 const SWR_PS_STATE
*pPSState
= &state
.psState
;
1035 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
1036 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
1038 // broadcast scalars
1039 BarycentricCoeffs coeffs
;
1040 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
1041 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
1042 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
1044 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
1045 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
1046 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
1048 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
1049 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
1050 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
1052 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
1054 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
1055 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
1056 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// working copies of the buffer pointers; they are advanced after every SIMD tile
1058 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
1059 uint32_t NumRT
= state
.psState
.numRenderTargets
;
1060 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1062 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
1064 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
1065 RDTSC_STOP(BESetup
, 0, 0);
// per-triangle pixel shader inputs, set once before the tile loops
1067 SWR_PS_CONTEXT psContext
;
1068 psContext
.pAttribs
= work
.pAttribs
;
1069 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
1070 psContext
.pRecipW
= work
.pRecipW
;
1071 psContext
.frontFace
= work
.triFlags
.frontFacing
;
1072 psContext
.primID
= work
.triFlags
.primID
;
1074 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
1075 psContext
.I
= work
.I
;
1076 psContext
.J
= work
.J
;
1077 psContext
.recipDet
= work
.recipDet
;
1078 psContext
.pSamplePosX
= (const float*)&MultisampleTraits
<sampleCount
>::samplePosX
;
1079 psContext
.pSamplePosY
= (const float*)&MultisampleTraits
<sampleCount
>::samplePosY
;
1080 const uint32_t numSamples
= MultisampleTraits
<sampleCount
>::numSamples
;
1082 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
1085 psContext
.vY
.UL
= _simd_add_ps(vQuadULOffsetsY
, _simd_set1_ps((float)yy
));
1087 psContext
.vY
.center
= _simd_add_ps(vQuadCenterOffsetsY
, _simd_set1_ps((float)yy
));
1089 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
1091 psContext
.vX
.UL
= _simd_add_ps(vQuadULOffsetsX
, _simd_set1_ps((float)xx
));
1093 psContext
.vX
.center
= _simd_add_ps(vQuadCenterOffsetsX
, _simd_set1_ps((float)xx
));
1095 RDTSC_START(BEBarycentric
);
1096 backendFuncs
.pfnCalcPixelBarycentrics(coeffs
, psContext
);
1097 RDTSC_STOP(BEBarycentric
, 0, 0);
1101 generateInputCoverage
<sampleCount
, SWR_MSAA_STANDARD_PATTERN
, false>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
1106 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
1107 RDTSC_START(BEBarycentric
);
1108 backendFuncs
.pfnCalcCentroidBarycentrics(coeffs
, psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
1109 RDTSC_STOP(BEBarycentric
, 0, 0);
// shade each sample separately; samples with no coverage in this SIMD tile fail the test below
1112 for(uint32_t sample
= 0; sample
< numSamples
; sample
++)
1114 if (work
.coverageMask
[sample
] & MASK
)
1116 RDTSC_START(BEBarycentric
);
1118 // calculate per sample positions
1119 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, MultisampleTraits
<sampleCount
>::vX(sample
));
1120 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, MultisampleTraits
<sampleCount
>::vY(sample
));
1122 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
1123 simdscalar vCoverageMask
= vMask(coverageMask
);
1125 backendFuncs
.pfnCalcSampleBarycentrics(coeffs
, psContext
);
// evaluate the Z plane at the per-sample barycentrics
1128 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1130 RDTSC_STOP(BEBarycentric
, 0, 0);
1132 // interpolate user clip distance if available
// NOTE(review): this updates coverageMask after vCoverageMask was already built
// from it on line 1123, and no later read of coverageMask is visible - confirm intent.
1133 if (rastState
.clipDistanceMask
)
1135 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
1136 psContext
.vI
.sample
, psContext
.vJ
.sample
);
1139 simdscalar depthPassMask
= vCoverageMask
;
1140 simdscalar stencilPassMask
= vCoverageMask
;
1142 // offset depth/stencil buffers current sample
1143 uint8_t *pDepthSample
= pDepthBase
+ MultisampleTraits
<sampleCount
>::RasterTileDepthOffset(sample
);
1144 uint8_t *pStencilSample
= pStencilBase
+ MultisampleTraits
<sampleCount
>::RasterTileStencilOffset(sample
);
// depth/stencil test ahead of the pixel shader when CanEarlyZ allows it
1147 if (CanEarlyZ(pPSState
))
1149 RDTSC_START(BEEarlyDepthTest
);
1150 depthPassMask
= DepthStencilTest(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
,
1151 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1152 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
1154 // early-exit if no samples passed depth or earlyZ is forced on.
1155 if (pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
1157 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1158 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1160 if (!_simd_movemask_ps(depthPassMask
))
1162 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1168 psContext
.sampleIndex
= sample
;
1169 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
1171 // execute pixel shader
1172 RDTSC_START(BEPixelShader
);
1173 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
1174 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
1175 RDTSC_STOP(BEPixelShader
, 0, 0);
// read the active mask back from the shader context after the shader has run
1177 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// depth/stencil test after the pixel shader when it could not be done early
1180 if (!CanEarlyZ(pPSState
))
1182 RDTSC_START(BELateDepthTest
);
1183 depthPassMask
= DepthStencilTest(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
,
1184 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1185 RDTSC_STOP(BELateDepthTest
, 0, 0);
1187 if (!_simd_movemask_ps(depthPassMask
))
1189 // need to call depth/stencil write for stencil write
1190 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1191 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1193 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// count the lanes that passed the depth test
1198 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1199 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1200 UPDATE_STAT(DepthPassCount
, statCount
);
1203 RDTSC_START(BEOutputMerger
);
1204 backendFuncs
.pfnOutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
,
1205 vCoverageMask
, depthPassMask
);
1207 // do final depth write after all pixel kills
1208 if (!pPSState
->forceEarlyZ
)
1210 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1211 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1213 RDTSC_STOP(BEOutputMerger
, 0, 0);
// shift this sample's coverage mask down by one SIMD tile
1215 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// advance the depth/stencil/color pointers by one SIMD tile
1217 RDTSC_START(BEEndTile
);
1218 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1219 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1221 for (uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1223 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1225 RDTSC_STOP(BEEndTile
, 0, 0);
/// @brief Pixel-rate backend. Steps over a KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM
///        area starting at (x, y) in SIMD tile increments. The pixel shader is
///        invoked at pixel-center barycentrics (not per sample): before the
///        per-sample depth test when the shader writes oDepth or can kill
///        pixels, otherwise afterwards and only if at least one sample passed.
///        Depth/stencil is tested over numCoverageSamples samples, and the
///        shader result is then merged and written for numOMSamples samples.
/// @param pDC            draw context; source of the API state and backend function table
/// @param workerId       not referenced in the visible lines
/// @param x              starting x coordinate of the loop over the tile
/// @param y              starting y coordinate of the loop over the tile
/// @param work           triangle descriptor; work.coverageMask[sample] is shifted in place
/// @param renderBuffers  color, depth and stencil base pointers
// NOTE(review): this listing appears to be missing control-flow lines: several
// `if`/`else if` lines have no visible body (including the first numOMSamples
// branch), `else` keywords are absent, and bInputCoverage / bCentroidPos are
// never referenced. Confirm against the original file before editing.
1230 template<uint32_t sampleCountT
, uint32_t samplePattern
, uint32_t inputCoverage
, uint32_t centroidPos
, uint32_t forcedSampleCount
>
1231 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1233 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
1234 static const SWR_MULTISAMPLE_COUNT sampleCount
= (SWR_MULTISAMPLE_COUNT
)sampleCountT
;
1235 static const bool bIsStandardPattern
= (bool)samplePattern
;
1236 static const bool bInputCoverage
= (bool)inputCoverage
;
1237 static const bool bCentroidPos
= (bool)centroidPos
;
1238 static const bool bForcedSampleCount
= (bool)forcedSampleCount
;
1240 RDTSC_START(BESetup
);
1242 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1243 const API_STATE
& state
= GetApiState(pDC
);
1244 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1245 const SWR_PS_STATE
*pPSState
= &state
.psState
;
1246 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
1247 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
1249 // broadcast scalars
1250 BarycentricCoeffs coeffs
;
1251 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
1252 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
1253 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
1255 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
1256 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
1257 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
1259 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
1260 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
1261 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
1263 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
1265 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
1266 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
1267 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// working copies of the buffer pointers; they are advanced after every SIMD tile
1269 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
1270 uint32_t NumRT
= state
.psState
.numRenderTargets
;
1271 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1273 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
1275 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
1276 RDTSC_STOP(BESetup
, 0, 0);
// per-triangle pixel shader inputs, set once before the tile loops
1278 SWR_PS_CONTEXT psContext
;
1279 psContext
.pAttribs
= work
.pAttribs
;
1280 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
1281 psContext
.frontFace
= work
.triFlags
.frontFacing
;
1282 psContext
.primID
= work
.triFlags
.primID
;
1283 psContext
.pRecipW
= work
.pRecipW
;
1284 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
1285 psContext
.I
= work
.I
;
1286 psContext
.J
= work
.J
;
1287 psContext
.recipDet
= work
.recipDet
;
1288 psContext
.pSamplePosX
= (const float*)&MultisampleTraits
<sampleCount
>::samplePosX
;
1289 psContext
.pSamplePosY
= (const float*)&MultisampleTraits
<sampleCount
>::samplePosY
;
1290 psContext
.sampleIndex
= 0;
// number of samples depth/stencil tested per pixel: all of them for the standard
// pattern; the other assignment visible below sets a single sample
1292 uint32_t numCoverageSamples
;
1293 if(bIsStandardPattern
)
1295 numCoverageSamples
= MultisampleTraits
<sampleCount
>::numSamples
;
1299 numCoverageSamples
= 1;
// number of samples the output-merger loop iterates over
1302 uint32_t numOMSamples
;
1303 // RT has to be single sample if we're in forcedMSAA mode
1304 if(bForcedSampleCount
&& (sampleCount
> SWR_MULTISAMPLE_1X
))
1308 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
1309 else if(bForcedSampleCount
&& (sampleCount
== SWR_MULTISAMPLE_1X
))
1311 numOMSamples
= GetNumSamples(pBlendState
->sampleCount
);
1313 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
1316 numOMSamples
= MultisampleTraits
<sampleCount
>::numSamples
;
1319 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
1321 psContext
.vY
.UL
= _simd_add_ps(vQuadULOffsetsY
, _simd_set1_ps((float)yy
));
1322 psContext
.vY
.center
= _simd_add_ps(vQuadCenterOffsetsY
, _simd_set1_ps((float)yy
));
1323 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// per-sample Z values, filled in the coverage loop and read back in the output-merger loop
1325 simdscalar vZ
[MultisampleTraits
<sampleCount
>::numSamples
];
1326 psContext
.vX
.UL
= _simd_add_ps(vQuadULOffsetsX
, _simd_set1_ps((float)xx
));
1327 // set pixel center positions
1328 psContext
.vX
.center
= _simd_add_ps(vQuadCenterOffsetsX
, _simd_set1_ps((float)xx
));
1332 generateInputCoverage
<sampleCount
, bIsStandardPattern
, bForcedSampleCount
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
1337 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
1338 RDTSC_START(BEBarycentric
);
1339 backendFuncs
.pfnCalcCentroidBarycentrics(coeffs
, psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
1340 RDTSC_STOP(BEBarycentric
, 0, 0);
1343 // if oDepth written to, or there is a potential to discard any samples, we need to
1344 // run the PS early, then interp or broadcast Z and test
1345 if(pPSState
->writesODepth
|| pPSState
->killsPixel
)
1347 RDTSC_START(BEBarycentric
);
1348 backendFuncs
.pfnCalcPixelBarycentrics(coeffs
, psContext
);
1351 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
1352 RDTSC_STOP(BEBarycentric
, 0, 0);
1354 // execute pixel shader
1355 RDTSC_START(BEPixelShader
);
1356 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
1357 RDTSC_STOP(BEPixelShader
, 0, 0);
// all lanes active (every bit set)
1361 psContext
.activeMask
= _simd_set1_epi32(-1);
1364 // need to declare enough space for all samples
1365 simdscalar vCoverageMask
[MultisampleTraits
<sampleCount
>::numSamples
];
1366 simdscalar depthPassMask
[MultisampleTraits
<sampleCount
>::numSamples
];
1367 simdscalar stencilPassMask
[MultisampleTraits
<sampleCount
>::numSamples
];
1368 simdscalar anyDepthSamplePassed
= _simd_setzero_ps();
1369 simdscalar anyStencilSamplePassed
= _simd_setzero_ps();
// per-sample coverage, Z interpolation and depth/stencil test
1370 for(uint32_t sample
= 0; sample
< numCoverageSamples
; sample
++)
1372 vCoverageMask
[sample
] = vMask(work
.coverageMask
[sample
] & MASK
);
1374 // pull mask back out for any discards and AND it with coverage
1375 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], _simd_castsi_ps(psContext
.activeMask
));
// no lanes covered for this sample: zero all three masks
1377 if (!_simd_movemask_ps(vCoverageMask
[sample
]))
1379 vCoverageMask
[sample
] = depthPassMask
[sample
] = stencilPassMask
[sample
] = _simd_setzero_ps();
1383 if(bForcedSampleCount
)
1385 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
1386 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState
->sampleMask
), _simd_setzero_si()));
1387 anyDepthSamplePassed
= _simd_or_ps(anyDepthSamplePassed
, _simd_and_ps(vCoverageMask
[sample
], vSampleMask
));
1391 depthPassMask
[sample
] = vCoverageMask
[sample
];
1393 // if oDepth isn't written to, we need to interpolate Z for each sample
1394 // if clip distances are enabled, we need to interpolate for each sample
1395 if(!pPSState
->writesODepth
|| rastState
.clipDistanceMask
)
1397 RDTSC_START(BEBarycentric
);
1398 if(bIsStandardPattern
)
1400 // calculate per sample positions
1401 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, MultisampleTraits
<sampleCount
>::vX(sample
));
1402 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, MultisampleTraits
<sampleCount
>::vY(sample
));
// sample position taken at the pixel center
1406 psContext
.vX
.sample
= psContext
.vX
.center
;
1407 psContext
.vY
.sample
= psContext
.vY
.center
;
1410 // calc I & J per sample
1411 backendFuncs
.pfnCalcSampleBarycentrics(coeffs
, psContext
);
1414 if (!pPSState
->writesODepth
)
1416 vZ
[sample
] = vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1419 ///@todo: perspective correct vs non-perspective correct clipping?
1420 // interpolate clip distances
1421 if (rastState
.clipDistanceMask
)
1423 uint8_t clipMask
= ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
1424 psContext
.vI
.sample
, psContext
.vJ
.sample
);
1425 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], vMask(~clipMask
));
1427 RDTSC_STOP(BEBarycentric
, 0, 0);
1429 // else 'broadcast' and test psContext.vZ written from the PS each sample
1432 vZ
[sample
] = psContext
.vZ
;
1435 // offset depth/stencil buffers current sample
1436 uint8_t *pDepthSample
= pDepthBase
+ MultisampleTraits
<sampleCount
>::RasterTileDepthOffset(sample
);
1437 uint8_t * pStencilSample
= pStencilBase
+ MultisampleTraits
<sampleCount
>::RasterTileStencilOffset(sample
);
1439 // ZTest for this sample
1440 RDTSC_START(BEEarlyDepthTest
);
1441 stencilPassMask
[sample
] = vCoverageMask
[sample
];
1442 depthPassMask
[sample
] = DepthStencilTest(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
,
1443 vZ
[sample
], pDepthSample
, vCoverageMask
[sample
], pStencilSample
, &stencilPassMask
[sample
]);
1444 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
// accumulate, per lane, whether any sample passed depth / stencil
1446 anyDepthSamplePassed
= _simd_or_ps(anyDepthSamplePassed
, depthPassMask
[sample
]);
1447 anyStencilSamplePassed
= _simd_or_ps(anyStencilSamplePassed
, stencilPassMask
[sample
]);
1448 uint32_t statMask
= _simd_movemask_ps(depthPassMask
[sample
]);
1449 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1450 UPDATE_STAT(DepthPassCount
, statCount
);
1453 // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
1454 if(!pPSState
->writesODepth
&& !pPSState
->killsPixel
&& _simd_movemask_ps(anyDepthSamplePassed
))
1456 RDTSC_START(BEBarycentric
);
1457 backendFuncs
.pfnCalcPixelBarycentrics(coeffs
, psContext
);
1459 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
1460 RDTSC_STOP(BEBarycentric
, 0, 0);
1462 // execute pixel shader
1463 RDTSC_START(BEPixelShader
);
1464 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
1465 RDTSC_STOP(BEPixelShader
, 0, 0);
1467 ///@todo: make sure this works for kill pixel
1468 else if(!_simd_movemask_ps(anyStencilSamplePassed
))
1473 // loop over all samples, broadcasting the results of the PS to all passing pixels
1474 for(uint32_t sample
= 0; sample
< numOMSamples
; sample
++)
1476 uint8_t *pDepthSample
= pDepthBase
+ MultisampleTraits
<sampleCount
>::RasterTileDepthOffset(sample
);
1477 uint8_t * pStencilSample
= pStencilBase
+ MultisampleTraits
<sampleCount
>::RasterTileStencilOffset(sample
);
1480 RDTSC_START(BEOutputMerger
);
1482 // skip if none of the pixels for this sample passed
1483 simdscalar coverageMaskSample
;
1484 simdscalar depthMaskSample
;
1485 simdscalar stencilMaskSample
;
1486 simdscalar vInterpolatedZ
;
1488 // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
1489 // depth test is disabled, so just set the z val to 0.
// NOTE(review): stencilMaskSample is not assigned on this path in the visible
// lines, yet it is passed to DepthStencilWrite on line 1530/1531 - confirm.
1490 if(bForcedSampleCount
)
1492 coverageMaskSample
= depthMaskSample
= anyDepthSamplePassed
;
1493 vInterpolatedZ
= _simd_setzero_ps();
1495 else if(bIsStandardPattern
)
1497 if(!_simd_movemask_ps(depthPassMask
[sample
]))
1499 depthPassMask
[sample
] = _simd_setzero_ps();
1500 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, vZ
[sample
], pDepthSample
, depthPassMask
[sample
],
1501 vCoverageMask
[sample
], pStencilSample
, stencilPassMask
[sample
]);
1504 coverageMaskSample
= vCoverageMask
[sample
];
1505 depthMaskSample
= depthPassMask
[sample
];
1506 stencilMaskSample
= stencilPassMask
[sample
];
1507 vInterpolatedZ
= vZ
[sample
];
1511 // center pattern only needs to use a single depth test as all samples are at the same position
1512 if(!_simd_movemask_ps(depthPassMask
[0]))
1514 depthPassMask
[0] = _simd_setzero_ps();
1515 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, vZ
[0], pDepthSample
, depthPassMask
[0],
1516 vCoverageMask
[0], pStencilSample
, stencilPassMask
[0]);
1519 coverageMaskSample
= (vCoverageMask
[0]);
1520 depthMaskSample
= depthPassMask
[0];
1521 stencilMaskSample
= stencilPassMask
[0];
1522 vInterpolatedZ
= vZ
[0];
// NOTE(review): RDTSC_START(BEOutputMerger) also appears on line 1480 with only
// one visible RDTSC_STOP for it in this loop - confirm the pairing.
1526 RDTSC_START(BEOutputMerger
);
1527 backendFuncs
.pfnOutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
,
1528 coverageMaskSample
, depthMaskSample
);
1530 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, vInterpolatedZ
, pDepthSample
, depthMaskSample
,
1531 coverageMaskSample
, pStencilSample
, stencilMaskSample
);
1532 RDTSC_STOP(BEOutputMerger
, 0, 0);
// shift every tested sample's coverage mask and advance the buffer pointers by one SIMD tile
1536 RDTSC_START(BEEndTile
);
1537 for(uint32_t sample
= 0; sample
< numCoverageSamples
; sample
++)
1539 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1542 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1543 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1545 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1547 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1549 RDTSC_STOP(BEEndTile
, 0, 0);
1553 // optimized backend flow with NULL PS
1554 template<uint32_t sampleCountT
>
1555 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1557 RDTSC_START(BESetup
);
1559 static const SWR_MULTISAMPLE_COUNT sampleCount
= (SWR_MULTISAMPLE_COUNT
)sampleCountT
;
1560 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1561 const API_STATE
& state
= GetApiState(pDC
);
1562 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
1564 // broadcast scalars
1565 BarycentricCoeffs coeffs
;
1566 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
1567 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
1568 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
1570 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
1571 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
1572 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
1574 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
1575 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
1576 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
1578 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
1580 BYTE
*pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
1582 RDTSC_STOP(BESetup
, 0, 0);
1584 SWR_PS_CONTEXT psContext
;
1585 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
1588 simdscalar vYSamplePosUL
= _simd_add_ps(vQuadULOffsetsY
, _simd_set1_ps((float)yy
));
1590 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
1593 simdscalar vXSamplePosUL
= _simd_add_ps(vQuadULOffsetsX
, _simd_set1_ps((float)xx
));
1595 // iterate over active samples
1596 unsigned long sample
= 0;
1597 uint32_t sampleMask
= state
.blendState
.sampleMask
;
1598 while (_BitScanForward(&sample
, sampleMask
))
1600 sampleMask
&= ~(1 << sample
);
1601 if (work
.coverageMask
[sample
] & MASK
)
1603 RDTSC_START(BEBarycentric
);
1604 // calculate per sample positions
1605 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, MultisampleTraits
<sampleCount
>::vX(sample
));
1606 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, MultisampleTraits
<sampleCount
>::vY(sample
));
1608 backendFuncs
.pfnCalcSampleBarycentrics(coeffs
, psContext
);
1611 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1613 RDTSC_STOP(BEBarycentric
, 0, 0);
1615 simdscalar vCoverageMask
= vMask(work
.coverageMask
[sample
] & MASK
);
1616 simdscalar stencilPassMask
= vCoverageMask
;
1618 // offset depth/stencil buffers current sample
1619 uint8_t *pDepthSample
= pDepthBase
+ MultisampleTraits
<sampleCount
>::RasterTileDepthOffset(sample
);
1620 uint8_t *pStencilSample
= pStencilBase
+ MultisampleTraits
<sampleCount
>::RasterTileStencilOffset(sample
);
1622 RDTSC_START(BEEarlyDepthTest
);
1623 simdscalar depthPassMask
= DepthStencilTest(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
,
1624 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1625 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1626 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1627 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
1629 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1630 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1631 UPDATE_STAT(DepthPassCount
, statCount
);
1633 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1635 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1636 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1641 void InitClearTilesTable()
1643 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
1645 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1646 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1647 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1648 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1649 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
1652 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
1653 PFN_BACKEND_FUNC gBackendSingleSample
[2][2] = {};
1654 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][SWR_INPUT_COVERAGE_MAX
][2][2] = {};
1655 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_INPUT_COVERAGE_MAX
][2] = {};
1656 PFN_OUTPUT_MERGER gBackendOutputMergerTable
[SWR_NUM_RENDERTARGETS
+1][SWR_MULTISAMPLE_TYPE_MAX
] = {};
1657 PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable
[2] = {};
1658 PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable
[2] = {};
1659 PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable
[SWR_MULTISAMPLE_TYPE_MAX
][2][2][2] = {};
1661 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1662 // arguments to static template arguments.
1663 template <uint32_t... ArgsT
>
1666 // Last Arg Terminator
1667 static PFN_OUTPUT_MERGER
GetFunc(SWR_MULTISAMPLE_COUNT tArg
)
1671 case SWR_MULTISAMPLE_1X
: return OutputMerger
<ArgsT
..., SWR_MULTISAMPLE_1X
>; break;
1672 case SWR_MULTISAMPLE_2X
: return OutputMerger
<ArgsT
..., SWR_MULTISAMPLE_2X
>; break;
1673 case SWR_MULTISAMPLE_4X
: return OutputMerger
<ArgsT
..., SWR_MULTISAMPLE_4X
>; break;
1674 case SWR_MULTISAMPLE_8X
: return OutputMerger
<ArgsT
..., SWR_MULTISAMPLE_8X
>; break;
1675 case SWR_MULTISAMPLE_16X
: return OutputMerger
<ArgsT
..., SWR_MULTISAMPLE_16X
>; break;
1677 SWR_ASSERT(0 && "Invalid sample count\n");
1683 // Recursively parse args
1684 template <typename
... TArgsT
>
1685 static PFN_OUTPUT_MERGER
GetFunc(uint32_t tArg
, TArgsT
... remainingArgs
)
1689 case 0: return OMChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...); break;
1690 case 1: return OMChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...); break;
1691 case 2: return OMChooser
<ArgsT
..., 2>::GetFunc(remainingArgs
...); break;
1692 case 3: return OMChooser
<ArgsT
..., 3>::GetFunc(remainingArgs
...); break;
1693 case 4: return OMChooser
<ArgsT
..., 4>::GetFunc(remainingArgs
...); break;
1694 case 5: return OMChooser
<ArgsT
..., 5>::GetFunc(remainingArgs
...); break;
1695 case 6: return OMChooser
<ArgsT
..., 6>::GetFunc(remainingArgs
...); break;
1696 case 7: return OMChooser
<ArgsT
..., 7>::GetFunc(remainingArgs
...); break;
1697 case 8: return OMChooser
<ArgsT
..., 8>::GetFunc(remainingArgs
...); break;
1699 SWR_ASSERT(0 && "Invalid RT index\n");
1706 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1707 // arguments to static template arguments.
1708 template <uint32_t... ArgsT
>
1709 struct BECentroidBarycentricChooser
1712 // Last Arg Terminator
1713 template <typename
... TArgsT
>
1714 static PFN_CALC_CENTROID_BARYCENTRICS
GetFunc(uint32_t tArg
)
1718 return CalcCentroidBarycentrics
<ArgsT
..., 1>;
1721 return CalcCentroidBarycentrics
<ArgsT
..., 0>;
1724 // Recursively parse args
1725 template <typename
... TArgsT
>
1726 static PFN_CALC_CENTROID_BARYCENTRICS
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1730 case SWR_MULTISAMPLE_1X
: return BECentroidBarycentricChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1731 case SWR_MULTISAMPLE_2X
: return BECentroidBarycentricChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1732 case SWR_MULTISAMPLE_4X
: return BECentroidBarycentricChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1733 case SWR_MULTISAMPLE_8X
: return BECentroidBarycentricChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1734 case SWR_MULTISAMPLE_16X
: return BECentroidBarycentricChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1736 SWR_ASSERT(0 && "Invalid sample count\n");
1742 // Recursively parse args
1743 template <typename
... TArgsT
>
1744 static PFN_CALC_CENTROID_BARYCENTRICS
GetFunc(uint32_t tArg
, TArgsT
... remainingArgs
)
1748 return BECentroidBarycentricChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1751 return BECentroidBarycentricChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1755 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1756 // arguments to static template arguments.
1757 template <uint32_t... ArgsT
>
1760 // Last Arg Terminator
1761 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1765 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<ArgsT
...>; break;
1766 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<ArgsT
...>; break;
1767 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<ArgsT
...>; break;
1769 SWR_ASSERT(0 && "Invalid backend func\n");
1776 // Recursively parse args
1777 template <typename
... TArgsT
>
1778 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1782 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1783 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1784 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1785 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1786 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1788 SWR_ASSERT(0 && "Invalid sample count\n");
1794 // Recursively parse args
1795 template <typename
... TArgsT
>
1796 static PFN_BACKEND_FUNC
GetFunc(uint32_t tArg
, TArgsT
... remainingArgs
)
1800 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1803 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1807 template <uint32_t numRenderTargets
, SWR_MULTISAMPLE_COUNT numSampleRates
>
1808 void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table
)[numRenderTargets
][numSampleRates
])
1810 for(uint32_t rtNum
= SWR_ATTACHMENT_COLOR0
; rtNum
< numRenderTargets
; rtNum
++)
1812 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< numSampleRates
; sampleCount
++)
1814 table
[rtNum
][sampleCount
] =
1815 OMChooser
<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT
)rtNum
, (SWR_MULTISAMPLE_COUNT
)sampleCount
);
1820 template <SWR_MULTISAMPLE_COUNT numSampleRates
>
1821 void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable
)[2],
1822 PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable
)[2],
1823 PFN_CALC_CENTROID_BARYCENTRICS (¢roidTable
)[numSampleRates
][2][2][2])
1825 pixelTable
[0] = CalcPixelBarycentrics
<0>;
1826 pixelTable
[1] = CalcPixelBarycentrics
<1>;
1828 sampleTable
[0] = CalcSampleBarycentrics
<0>;
1829 sampleTable
[1] = CalcSampleBarycentrics
<1>;
1831 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< numSampleRates
; sampleCount
++)
1833 for(uint32_t baryMask
= 0; baryMask
< 2; baryMask
++)
1835 for(uint32_t patternNum
= 0; patternNum
< 2; patternNum
++)
1837 for(uint32_t forcedSampleEnable
= 0; forcedSampleEnable
< 2; forcedSampleEnable
++)
1839 centroidTable
[sampleCount
][baryMask
][patternNum
][forcedSampleEnable
]=
1840 BECentroidBarycentricChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, baryMask
, patternNum
, forcedSampleEnable
);
1847 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[2][2])
1849 gBackendSingleSample
[0][0] = BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, SWR_INPUT_COVERAGE_NONE
, 0, 0, (SWR_BACKEND_FUNCS
)SWR_BACKEND_SINGLE_SAMPLE
);
1850 gBackendSingleSample
[0][1] = BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, SWR_INPUT_COVERAGE_NONE
, 1, 0, (SWR_BACKEND_FUNCS
)SWR_BACKEND_SINGLE_SAMPLE
);
1851 gBackendSingleSample
[1][0] = BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, SWR_INPUT_COVERAGE_NORMAL
, 0, 0, (SWR_BACKEND_FUNCS
)SWR_BACKEND_SINGLE_SAMPLE
);
1852 gBackendSingleSample
[1][1] = BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, SWR_INPUT_COVERAGE_NORMAL
, 1, 0, (SWR_BACKEND_FUNCS
)SWR_BACKEND_SINGLE_SAMPLE
);
1855 template <SWR_MULTISAMPLE_COUNT numSampleRates
, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns
, SWR_INPUT_COVERAGE numCoverageModes
>
1856 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table
)[numSampleRates
][numSamplePatterns
][numCoverageModes
][2][2])
1858 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< numSampleRates
; sampleCount
++)
1860 for(uint32_t samplePattern
= SWR_MSAA_CENTER_PATTERN
; samplePattern
< numSamplePatterns
; samplePattern
++)
1862 for(uint32_t inputCoverage
= SWR_INPUT_COVERAGE_NONE
; inputCoverage
< numCoverageModes
; inputCoverage
++)
1864 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1866 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][0] =
1867 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, samplePattern
, inputCoverage
, isCentroid
, 0, (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_PIXEL_RATE
);
1868 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][1] =
1869 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, samplePattern
, inputCoverage
, isCentroid
, 1, (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_PIXEL_RATE
);
1876 template <uint32_t numSampleRates
, uint32_t numCoverageModes
>
1877 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[numSampleRates
][numCoverageModes
][2])
1879 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< numSampleRates
; sampleCount
++)
1881 for(uint32_t inputCoverage
= SWR_INPUT_COVERAGE_NONE
; inputCoverage
< numCoverageModes
; inputCoverage
++)
1883 table
[sampleCount
][inputCoverage
][0] =
1884 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, inputCoverage
, 0, 0, (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1885 table
[sampleCount
][inputCoverage
][1] =
1886 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, inputCoverage
, 1, 0, (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
// Populates the global backend dispatch tables: the single-sample,
// pixel-rate and sample-rate backend tables, the output-merger table, the
// barycentric tables, and finally the NULL-PS table.
// NOTE(review): the end of this function is not visible in this excerpt.
1891 void InitBackendFuncTables()
// backend tables, sized by the *_MAX enumerators
1893 InitBackendSampleFuncTable(gBackendSingleSample
);
1894 InitBackendPixelFuncTable
<(SWR_MULTISAMPLE_COUNT
)SWR_MULTISAMPLE_TYPE_MAX
, SWR_MSAA_SAMPLE_PATTERN_MAX
, SWR_INPUT_COVERAGE_MAX
>(gBackendPixelRateTable
);
1895 InitBackendSampleFuncTable
<SWR_MULTISAMPLE_TYPE_MAX
, SWR_INPUT_COVERAGE_MAX
>(gBackendSampleRateTable
);
// output-merger table has SWR_NUM_RENDERTARGETS+1 rows
1896 InitBackendOMFuncTable
<SWR_NUM_RENDERTARGETS
+1, SWR_MULTISAMPLE_TYPE_MAX
>(gBackendOutputMergerTable
);
1897 InitBackendBarycentricsTables
<(SWR_MULTISAMPLE_COUNT
)(SWR_MULTISAMPLE_TYPE_MAX
)>(gPixelBarycentricTable
, gSampleBarycentricTable
, gCentroidBarycentricTable
);
// NULL-PS backends: one BackendNullPS instantiation per sample count
1899 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1900 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1901 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1902 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1903 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;