1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
* @brief Backend handles rasterization, pixel shading and output merger
*        operations.
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "depthstencil.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
40 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, DWORD
[4]);
41 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
)
50 RDTSC_START(BEDispatch
);
52 SWR_CONTEXT
*pContext
= pDC
->pContext
;
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 const API_STATE
& state
= GetApiState(pDC
);
66 SWR_CS_CONTEXT csContext
{ 0 };
67 csContext
.tileCounter
= threadGroupId
;
68 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
69 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
70 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
71 csContext
.pTGSM
= pContext
->pScratch
[workerId
];
72 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
74 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
76 UPDATE_STAT(CsInvocations
, state
.totalThreadsInGroup
);
78 RDTSC_STOP(BEDispatch
, 1, 0);
81 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
83 SYNC_DESC
*pSync
= (SYNC_DESC
*)pUserData
;
86 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
87 SWR_ASSERT(x
== 0 && y
== 0);
89 if (pSync
->pfnCallbackFunc
!= nullptr)
91 pSync
->pfnCallbackFunc(pSync
->userData
, pSync
->userData2
, pSync
->userData3
);
95 void ProcessQueryStatsBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
97 QUERY_DESC
* pQueryDesc
= (QUERY_DESC
*)pUserData
;
98 SWR_STATS
* pStats
= pQueryDesc
->pStats
;
99 SWR_CONTEXT
*pContext
= pDC
->pContext
;
101 SWR_ASSERT(pStats
!= nullptr);
103 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
105 pStats
->DepthPassCount
+= pContext
->stats
[i
].DepthPassCount
;
107 pStats
->IaVertices
+= pContext
->stats
[i
].IaVertices
;
108 pStats
->IaPrimitives
+= pContext
->stats
[i
].IaPrimitives
;
109 pStats
->VsInvocations
+= pContext
->stats
[i
].VsInvocations
;
110 pStats
->HsInvocations
+= pContext
->stats
[i
].HsInvocations
;
111 pStats
->DsInvocations
+= pContext
->stats
[i
].DsInvocations
;
112 pStats
->GsInvocations
+= pContext
->stats
[i
].GsInvocations
;
113 pStats
->PsInvocations
+= pContext
->stats
[i
].PsInvocations
;
114 pStats
->CInvocations
+= pContext
->stats
[i
].CInvocations
;
115 pStats
->CsInvocations
+= pContext
->stats
[i
].CsInvocations
;
116 pStats
->CPrimitives
+= pContext
->stats
[i
].CPrimitives
;
117 pStats
->GsPrimitives
+= pContext
->stats
[i
].GsPrimitives
;
119 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
121 pStats
->SoWriteOffset
[stream
] += pContext
->stats
[i
].SoWriteOffset
[stream
];
123 /// @note client is required to provide valid write offset before every draw, so we clear
124 /// out the contents of the write offset when storing stats
125 pContext
->stats
[i
].SoWriteOffset
[stream
] = 0;
127 pStats
->SoPrimStorageNeeded
[stream
] += pContext
->stats
[i
].SoPrimStorageNeeded
[stream
];
128 pStats
->SoNumPrimsWritten
[stream
] += pContext
->stats
[i
].SoNumPrimsWritten
[stream
];
133 template<SWR_FORMAT format
>
134 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
136 auto lambda
= [&](int comp
)
138 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
139 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
142 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
143 for (uint32_t i
= 0; i
< numIter
; ++i
)
145 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
149 template<SWR_FORMAT format
>
150 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, DWORD clear
[4])
152 // convert clear color to hottile format
153 // clear color is in RGBA float/uint32
155 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
158 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
159 if (FormatTraits
<format
>::isNormalized(comp
))
161 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
162 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
164 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
165 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
168 uint32_t tileX
, tileY
;
169 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
170 const API_STATE
& state
= GetApiState(pDC
);
172 int top
= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
173 int bottom
= top
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
174 int left
= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
175 int right
= left
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
177 // intersect with scissor
178 top
= std::max(top
, state
.scissorInFixedPoint
.top
);
179 left
= std::max(left
, state
.scissorInFixedPoint
.left
);
180 bottom
= std::min(bottom
, state
.scissorInFixedPoint
.bottom
);
181 right
= std::min(right
, state
.scissorInFixedPoint
.right
);
183 // translate to local hottile origin
184 top
-= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
185 bottom
-= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
186 left
-= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
187 right
-= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
189 // convert to raster tiles
190 top
>>= (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
191 bottom
>>= (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
192 left
>>= (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
193 right
>>= (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
195 const int numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
196 // compute steps between raster tile samples / raster tiles / macro tile rows
197 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
198 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
199 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
200 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
202 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
);
203 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, left
, top
)) * numSamples
;
204 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
206 // loop over all raster tiles in the current hot tile
207 for (int y
= top
; y
<= bottom
; ++y
)
209 uint8_t* pRasterTile
= pRasterTileRow
;
210 for (int x
= left
; x
<= right
; ++x
)
212 for( int sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
214 ClearRasterTile
<format
>(pRasterTile
, vClear
);
215 pRasterTile
+= rasterTileSampleStep
;
218 pRasterTileRow
+= macroTileRowStep
;
221 pHotTile
->state
= HOTTILE_DIRTY
;
225 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
229 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
230 SWR_CONTEXT
*pContext
= pDC
->pContext
;
231 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
232 uint32_t numSamples
= GetNumSamples(sampleCount
);
234 SWR_ASSERT(pClear
->flags
.bits
!= 0); // shouldn't be here without a reason.
236 RDTSC_START(BEClear
);
238 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
240 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_COLOR0
, true, numSamples
);
241 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
242 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
243 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
244 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
245 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
246 pHotTile
->state
= HOTTILE_CLEAR
;
249 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
251 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
252 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
253 pHotTile
->state
= HOTTILE_CLEAR
;
256 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
258 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
260 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearStencil
;
261 pHotTile
->state
= HOTTILE_CLEAR
;
264 RDTSC_STOP(BEClear
, 0, 0);
269 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
270 RDTSC_START(BEClear
);
272 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
274 /// @todo clear data should come in as RGBA32_FLOAT
277 clearFloat
[0] = ((uint8_t*)(&pClear
->clearRTColor
))[0] / 255.0f
;
278 clearFloat
[1] = ((uint8_t*)(&pClear
->clearRTColor
))[1] / 255.0f
;
279 clearFloat
[2] = ((uint8_t*)(&pClear
->clearRTColor
))[2] / 255.0f
;
280 clearFloat
[3] = ((uint8_t*)(&pClear
->clearRTColor
))[3] / 255.0f
;
281 clearData
[0] = *(DWORD
*)&clearFloat
[0];
282 clearData
[1] = *(DWORD
*)&clearFloat
[1];
283 clearData
[2] = *(DWORD
*)&clearFloat
[2];
284 clearData
[3] = *(DWORD
*)&clearFloat
[3];
286 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
287 SWR_ASSERT(pfnClearTiles
!= nullptr);
289 pfnClearTiles(pDC
, SWR_ATTACHMENT_COLOR0
, macroTile
, clearData
);
292 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
295 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
296 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
297 SWR_ASSERT(pfnClearTiles
!= nullptr);
299 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, clearData
);
302 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
304 uint32_t value
= pClear
->clearStencil
;
306 clearData
[0] = *(DWORD
*)&value
;
307 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
309 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, clearData
);
312 RDTSC_STOP(BEClear
, 0, 0);
317 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
319 RDTSC_START(BEStoreTiles
);
320 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
321 SWR_CONTEXT
*pContext
= pDC
->pContext
;
323 #ifdef KNOB_ENABLE_RDTSC
324 uint32_t numTiles
= 0;
326 SWR_FORMAT srcFormat
;
327 switch (pDesc
->attachment
)
329 case SWR_ATTACHMENT_COLOR0
:
330 case SWR_ATTACHMENT_COLOR1
:
331 case SWR_ATTACHMENT_COLOR2
:
332 case SWR_ATTACHMENT_COLOR3
:
333 case SWR_ATTACHMENT_COLOR4
:
334 case SWR_ATTACHMENT_COLOR5
:
335 case SWR_ATTACHMENT_COLOR6
:
336 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
337 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
338 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
339 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc
->attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
343 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
345 // Only need to store the hottile if it's been rendered to...
346 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, pDesc
->attachment
, false);
349 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
350 if (pHotTile
->state
== HOTTILE_CLEAR
)
352 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
353 SWR_ASSERT(pfnClearTiles
!= nullptr);
355 pfnClearTiles(pDC
, pDesc
->attachment
, macroTile
, pHotTile
->clearData
);
358 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
360 int destX
= KNOB_MACROTILE_X_DIM
* x
;
361 int destY
= KNOB_MACROTILE_Y_DIM
* y
;
363 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
364 pDesc
->attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
368 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
370 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
373 RDTSC_STOP(BEStoreTiles
, numTiles
, pDC
->drawId
);
377 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
379 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
380 SWR_CONTEXT
*pContext
= pDC
->pContext
;
382 const int numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
384 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
386 if (pDesc
->attachmentMask
& (1 << i
))
388 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
389 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
392 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
398 #if KNOB_SIMD_WIDTH == 8
399 const __m256 vCenterOffsetsX
= {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
400 const __m256 vCenterOffsetsY
= {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
401 const __m256 vULOffsetsX
= {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
402 const __m256 vULOffsetsY
= {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
404 #error Unsupported vector width
408 bool CanEarlyZ(const SWR_PS_STATE
*pPSState
)
410 return (pPSState
->forceEarlyZ
|| (!pPSState
->writesODepth
&& !pPSState
->usesSourceDepth
&& !pPSState
->usesUAV
));
413 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
415 simdscalar vClipMask
= _simd_setzero_ps();
416 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
418 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
420 // pull triangle clip distance values from clip buffer
421 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
422 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
423 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
426 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
428 // clip if interpolated clip distance is < 0 || NAN
429 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
431 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
434 return _simd_movemask_ps(vClipMask
);
438 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
440 RDTSC_START(BESingleSampleBackend
);
441 RDTSC_START(BESetup
);
443 SWR_CONTEXT
*pContext
= pDC
->pContext
;
444 const API_STATE
& state
= GetApiState(pDC
);
445 const SWR_RASTSTATE
& rastState
= state
.rastState
;
446 const SWR_PS_STATE
*pPSState
= &state
.psState
;
447 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
448 uint64_t coverageMask
= work
.coverageMask
[0];
451 BarycentricCoeffs coeffs
;
452 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
453 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
454 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
456 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
457 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
458 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
460 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
461 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
462 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
464 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
466 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
467 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
468 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
470 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
471 uint32_t NumRT
= state
.psState
.numRenderTargets
;
472 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
474 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
476 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
477 RDTSC_STOP(BESetup
, 0, 0);
479 SWR_PS_CONTEXT psContext
;
480 psContext
.pAttribs
= work
.pAttribs
;
481 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
482 psContext
.frontFace
= work
.triFlags
.frontFacing
;
483 psContext
.primID
= work
.triFlags
.primID
;
485 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
486 psContext
.I
= work
.I
;
487 psContext
.J
= work
.J
;
488 psContext
.recipDet
= work
.recipDet
;
489 psContext
.pRecipW
= work
.pRecipW
;
490 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
491 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
493 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
496 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
498 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
500 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
502 if(coverageMask
& MASK
)
504 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
506 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
508 if(T::bInputCoverage
)
510 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
513 RDTSC_START(BEBarycentric
);
514 CalcPixelBarycentrics(coeffs
, psContext
);
516 // for 1x case, centroid is pixel center
517 psContext
.vX
.centroid
= psContext
.vX
.center
;
518 psContext
.vY
.centroid
= psContext
.vY
.center
;
519 psContext
.vI
.centroid
= psContext
.vI
.center
;
520 psContext
.vJ
.centroid
= psContext
.vJ
.center
;
521 psContext
.vOneOverW
.centroid
= psContext
.vOneOverW
.center
;
523 // interpolate and quantize z
524 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
525 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
526 RDTSC_STOP(BEBarycentric
, 0, 0);
528 simdmask clipCoverageMask
= coverageMask
& MASK
;
529 // interpolate user clip distance if available
530 if(rastState
.clipDistanceMask
)
532 clipCoverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
533 psContext
.vI
.center
, psContext
.vJ
.center
);
536 simdscalar vCoverageMask
= vMask(clipCoverageMask
);
537 simdscalar depthPassMask
= vCoverageMask
;
538 simdscalar stencilPassMask
= vCoverageMask
;
543 RDTSC_START(BEEarlyDepthTest
);
544 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
545 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
546 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
548 // early-exit if no pixels passed depth or earlyZ is forced on
549 if(pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
551 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
552 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
554 if (!_simd_movemask_ps(depthPassMask
))
561 psContext
.sampleIndex
= 0;
562 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
564 // execute pixel shader
565 RDTSC_START(BEPixelShader
);
566 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
567 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
568 RDTSC_STOP(BEPixelShader
, 0, 0);
570 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
575 RDTSC_START(BELateDepthTest
);
576 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
577 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
578 RDTSC_STOP(BELateDepthTest
, 0, 0);
580 if(!_simd_movemask_ps(depthPassMask
))
582 // need to call depth/stencil write for stencil write
583 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
584 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
589 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
590 uint32_t statCount
= _mm_popcnt_u32(statMask
);
591 UPDATE_STAT(DepthPassCount
, statCount
);
594 RDTSC_START(BEOutputMerger
);
595 OutputMerger(psContext
, pColorBase
, 0, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
597 // do final depth write after all pixel kills
598 if (!pPSState
->forceEarlyZ
)
600 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
601 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
603 RDTSC_STOP(BEOutputMerger
, 0, 0);
607 RDTSC_START(BEEndTile
);
608 coverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
609 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
610 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
612 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
614 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
616 RDTSC_STOP(BEEndTile
, 0, 0);
619 RDTSC_STOP(BESingleSampleBackend
, 0, 0);
623 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
625 RDTSC_START(BESampleRateBackend
);
626 RDTSC_START(BESetup
);
628 SWR_CONTEXT
*pContext
= pDC
->pContext
;
629 const API_STATE
& state
= GetApiState(pDC
);
630 const SWR_RASTSTATE
& rastState
= state
.rastState
;
631 const SWR_PS_STATE
*pPSState
= &state
.psState
;
632 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
635 BarycentricCoeffs coeffs
;
636 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
637 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
638 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
640 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
641 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
642 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
644 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
645 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
646 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
648 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
650 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
651 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
652 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
654 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
655 uint32_t NumRT
= state
.psState
.numRenderTargets
;
656 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
658 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
660 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
661 RDTSC_STOP(BESetup
, 0, 0);
663 SWR_PS_CONTEXT psContext
;
664 psContext
.pAttribs
= work
.pAttribs
;
665 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
666 psContext
.pRecipW
= work
.pRecipW
;
667 psContext
.frontFace
= work
.triFlags
.frontFacing
;
668 psContext
.primID
= work
.triFlags
.primID
;
670 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
671 psContext
.I
= work
.I
;
672 psContext
.J
= work
.J
;
673 psContext
.recipDet
= work
.recipDet
;
674 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
675 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
677 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
680 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
682 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
684 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
686 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
688 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
690 RDTSC_START(BEBarycentric
);
691 CalcPixelBarycentrics(coeffs
, psContext
);
692 RDTSC_STOP(BEBarycentric
, 0, 0);
694 if(T::bInputCoverage
)
696 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
701 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
702 RDTSC_START(BEBarycentric
);
703 if(T::bIsStandardPattern
)
705 CalcCentroidPos
<T
>(psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
709 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
710 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
712 CalcCentroidBarycentrics(coeffs
, psContext
, psContext
.vX
.UL
, psContext
.vY
.UL
);
713 RDTSC_STOP(BEBarycentric
, 0, 0);
717 psContext
.vX
.centroid
= psContext
.vX
.sample
;
718 psContext
.vY
.centroid
= psContext
.vY
.sample
;
721 for(uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
723 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
726 RDTSC_START(BEBarycentric
);
727 // calculate per sample positions
728 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, T::MultisampleT::vX(sample
));
729 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, T::MultisampleT::vY(sample
));
731 CalcSampleBarycentrics(coeffs
, psContext
);
733 // interpolate and quantize z
734 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
735 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
736 RDTSC_STOP(BEBarycentric
, 0, 0);
738 // interpolate user clip distance if available
739 if (rastState
.clipDistanceMask
)
741 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
742 psContext
.vI
.sample
, psContext
.vJ
.sample
);
745 simdscalar vCoverageMask
= vMask(coverageMask
);
746 simdscalar depthPassMask
= vCoverageMask
;
747 simdscalar stencilPassMask
= vCoverageMask
;
749 // offset depth/stencil buffers current sample
750 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
751 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
756 RDTSC_START(BEEarlyDepthTest
);
757 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
758 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
759 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
761 // early-exit if no samples passed depth or earlyZ is forced on.
762 if (pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
764 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
765 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
767 if (!_simd_movemask_ps(depthPassMask
))
769 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
775 psContext
.sampleIndex
= sample
;
776 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
778 // execute pixel shader
779 RDTSC_START(BEPixelShader
);
780 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
781 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
782 RDTSC_STOP(BEPixelShader
, 0, 0);
784 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
789 RDTSC_START(BELateDepthTest
);
790 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
791 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
792 RDTSC_STOP(BELateDepthTest
, 0, 0);
794 if (!_simd_movemask_ps(depthPassMask
))
796 // need to call depth/stencil write for stencil write
797 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
798 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
800 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
805 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
806 uint32_t statCount
= _mm_popcnt_u32(statMask
);
807 UPDATE_STAT(DepthPassCount
, statCount
);
810 RDTSC_START(BEOutputMerger
);
811 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
813 // do final depth write after all pixel kills
814 if (!pPSState
->forceEarlyZ
)
816 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
817 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
819 RDTSC_STOP(BEOutputMerger
, 0, 0);
821 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
823 RDTSC_START(BEEndTile
);
824 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
825 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
827 for (uint32_t rt
= 0; rt
< NumRT
; ++rt
)
829 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
831 RDTSC_STOP(BEEndTile
, 0, 0);
834 RDTSC_STOP(BESampleRateBackend
, 0, 0);
838 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
840 RDTSC_START(BEPixelRateBackend
);
841 RDTSC_START(BESetup
);
843 SWR_CONTEXT
*pContext
= pDC
->pContext
;
844 const API_STATE
& state
= GetApiState(pDC
);
845 const SWR_RASTSTATE
& rastState
= state
.rastState
;
846 const SWR_PS_STATE
*pPSState
= &state
.psState
;
847 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
850 BarycentricCoeffs coeffs
;
851 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
852 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
853 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
855 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
856 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
857 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
859 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
860 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
861 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
863 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
865 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
866 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
867 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
869 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
870 uint32_t NumRT
= state
.psState
.numRenderTargets
;
871 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
873 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
875 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
876 RDTSC_STOP(BESetup
, 0, 0);
878 SWR_PS_CONTEXT psContext
;
879 psContext
.pAttribs
= work
.pAttribs
;
880 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
881 psContext
.frontFace
= work
.triFlags
.frontFacing
;
882 psContext
.primID
= work
.triFlags
.primID
;
883 psContext
.pRecipW
= work
.pRecipW
;
884 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
885 psContext
.I
= work
.I
;
886 psContext
.J
= work
.J
;
887 psContext
.recipDet
= work
.recipDet
;
888 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
889 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
890 psContext
.sampleIndex
= 0;
892 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, work
, coeffs
, state
, pDepthBase
, pStencilBase
, rastState
.clipDistanceMask
);
894 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
896 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
897 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
898 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
900 simdscalar activeLanes
;
901 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
902 activeLanes
= vMask(work
.anyCoveredSamples
& MASK
);
904 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
905 // set pixel center positions
906 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
908 RDTSC_START(BEBarycentric
);
909 CalcPixelBarycentrics(coeffs
, psContext
);
910 RDTSC_STOP(BEBarycentric
, 0, 0);
912 if (T::bInputCoverage
)
914 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
919 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
920 RDTSC_START(BEBarycentric
);
921 if(T::bIsStandardPattern
)
923 CalcCentroidPos
<T
>(psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
927 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
928 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
931 CalcCentroidBarycentrics(coeffs
, psContext
, psContext
.vX
.UL
, psContext
.vY
.UL
);
932 RDTSC_STOP(BEBarycentric
, 0, 0);
936 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
937 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
940 if(T::bForcedSampleCount
)
942 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
943 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState
->sampleMask
), _simd_setzero_si()));
944 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
948 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
950 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
951 UPDATE_STAT(DepthPassCount
, depthPassCount
);
954 // if we have no covered samples that passed depth at this point, go to next tile
955 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
957 if(pPSState
->usesSourceDepth
)
959 RDTSC_START(BEBarycentric
);
960 // interpolate and quantize z
961 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
962 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
963 RDTSC_STOP(BEBarycentric
, 0, 0);
966 // pixels that are currently active
967 psContext
.activeMask
= _simd_castps_si(activeLanes
);
968 psContext
.oMask
= T::MultisampleT::FullSampleMask();
970 // execute pixel shader
971 RDTSC_START(BEPixelShader
);
972 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
973 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
974 RDTSC_STOP(BEPixelShader
, 0, 0);
976 // update active lanes to remove any discarded or oMask'd pixels
977 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
978 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
981 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
983 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
984 UPDATE_STAT(DepthPassCount
, depthPassCount
);
987 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
988 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
991 // loop over all samples, broadcasting the results of the PS to all passing pixels
992 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(pBlendState
->sampleCount
); sample
++)
994 RDTSC_START(BEOutputMerger
);
995 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
996 uint32_t coverageSampleNum
= (T::bIsStandardPattern
) ? sample
: 0;
997 simdscalar coverageMask
, depthMask
;
998 if(T::bForcedSampleCount
)
1000 coverageMask
= depthMask
= activeLanes
;
1004 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
1005 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
1006 if(!_simd_movemask_ps(depthMask
))
1008 // stencil should already have been written in early/lateZ tests
1009 RDTSC_STOP(BEOutputMerger
, 0, 0);
1014 // broadcast the results of the PS to all passing pixels
1015 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, pPSState
->numRenderTargets
);
1017 if(!pPSState
->forceEarlyZ
&& !T::bForcedSampleCount
)
1019 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
1020 uint8_t * pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
1022 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
1023 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
1025 RDTSC_STOP(BEOutputMerger
, 0, 0);
1028 RDTSC_START(BEEndTile
);
1029 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1031 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1034 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1035 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1036 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1038 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1040 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1042 RDTSC_STOP(BEEndTile
, 0, 0);
1045 RDTSC_STOP(BEPixelRateBackend
, 0, 0);
1047 // optimized backend flow with NULL PS
1048 template<uint32_t sampleCountT
>
1049 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1051 RDTSC_START(BENullBackend
);
1052 ///@todo: handle center multisample pattern
1053 typedef SwrBackendTraits
<sampleCountT
, SWR_MSAA_STANDARD_PATTERN
> T
;
1054 RDTSC_START(BESetup
);
1056 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1057 const API_STATE
& state
= GetApiState(pDC
);
1058 const SWR_RASTSTATE
& rastState
= pDC
->pState
->state
.rastState
;
1060 // broadcast scalars
1061 BarycentricCoeffs coeffs
;
1062 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
1063 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
1064 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
1066 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
1067 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
1068 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
1070 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
1071 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
1072 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
1074 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
1076 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
1078 RDTSC_STOP(BESetup
, 0, 0);
1080 SWR_PS_CONTEXT psContext
;
1081 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
1084 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
1086 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
1089 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
1091 // iterate over active samples
1092 unsigned long sample
= 0;
1093 uint32_t sampleMask
= state
.blendState
.sampleMask
;
1094 while (_BitScanForward(&sample
, sampleMask
))
1096 sampleMask
&= ~(1 << sample
);
1097 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
1100 RDTSC_START(BEBarycentric
);
1101 // calculate per sample positions
1102 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, T::MultisampleT::vX(sample
));
1103 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, T::MultisampleT::vY(sample
));
1105 CalcSampleBarycentrics(coeffs
, psContext
);
1107 // interpolate and quantize z
1108 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1109 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
1111 RDTSC_STOP(BEBarycentric
, 0, 0);
1113 // interpolate user clip distance if available
1114 if (rastState
.clipDistanceMask
)
1116 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
1117 psContext
.vI
.sample
, psContext
.vJ
.sample
);
1120 simdscalar vCoverageMask
= vMask(coverageMask
);
1121 simdscalar stencilPassMask
= vCoverageMask
;
1123 // offset depth/stencil buffers current sample
1124 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
1125 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
1127 RDTSC_START(BEEarlyDepthTest
);
1128 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
1129 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1130 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1131 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1132 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
1134 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1135 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1136 UPDATE_STAT(DepthPassCount
, statCount
);
1138 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1140 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1141 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1144 RDTSC_STOP(BENullBackend
, 0, 0);
1147 void InitClearTilesTable()
1149 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
1151 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1152 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1153 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1154 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1155 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
1158 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
1159 PFN_BACKEND_FUNC gBackendSingleSample
[2] // input coverage
1163 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
]
1164 [SWR_MSAA_SAMPLE_PATTERN_MAX
]
1165 [2] // input coverage
1167 [2] // forcedSampleCount
1170 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
]
1171 [2] // input coverage
1176 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1177 // arguments to static template arguments.
1178 template <uint32_t... ArgsT
>
1181 // Last Arg Terminator
1182 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1186 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1187 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1188 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1190 SWR_ASSERT(0 && "Invalid backend func\n");
1196 // Recursively parse args
1197 template <typename
... TArgsT
>
1198 static PFN_BACKEND_FUNC
GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg
, TArgsT
... remainingArgs
)
1202 case SWR_MSAA_CENTER_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_CENTER_PATTERN
>::GetFunc(remainingArgs
...); break;
1203 case SWR_MSAA_STANDARD_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...); break;
1205 SWR_ASSERT(0 && "Invalid sample pattern\n");
1206 return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...);
1211 // Recursively parse args
1212 template <typename
... TArgsT
>
1213 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1217 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1218 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1219 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1220 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1221 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1223 SWR_ASSERT(0 && "Invalid sample count\n");
1224 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1229 // Recursively parse args
1230 template <typename
... TArgsT
>
1231 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1235 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1238 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1242 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[2][2][2])
1244 for(uint32_t inputCoverage
= 0; inputCoverage
< 2; inputCoverage
++)
1246 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1248 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1250 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1251 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, (inputCoverage
> 0),
1252 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
1258 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][2][2][2][2])
1260 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_MAX
; sampleCount
++)
1262 for(uint32_t samplePattern
= SWR_MSAA_CENTER_PATTERN
; samplePattern
< SWR_MSAA_SAMPLE_PATTERN_MAX
; samplePattern
++)
1264 for(uint32_t inputCoverage
= 0; inputCoverage
< 2; inputCoverage
++)
1266 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1268 for(uint32_t forcedSampleCount
= 0; forcedSampleCount
< 2; forcedSampleCount
++)
1270 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1272 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][forcedSampleCount
][canEarlyZ
] =
1273 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, (SWR_MSAA_SAMPLE_PATTERN
)samplePattern
, (inputCoverage
> 0),
1274 (isCentroid
> 0), (forcedSampleCount
> 0), (canEarlyZ
> 0), SWR_BACKEND_MSAA_PIXEL_RATE
);
1283 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_MAX
][2][2][2])
1285 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_MAX
; sampleCount
++)
1287 for(uint32_t inputCoverage
= 0; inputCoverage
< 2; inputCoverage
++)
1289 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1291 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1293 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1294 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, (inputCoverage
> 0),
1295 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1302 void InitBackendFuncTables()
1304 InitBackendSingleFuncTable(gBackendSingleSample
);
1305 InitBackendPixelFuncTable(gBackendPixelRateTable
);
1306 InitBackendSampleFuncTable(gBackendSampleRateTable
);
1308 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1309 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1310 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1311 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1312 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;