1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "depthstencil.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
40 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, DWORD
[4]);
41 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
)
50 RDTSC_START(BEDispatch
);
52 SWR_CONTEXT
*pContext
= pDC
->pContext
;
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 const API_STATE
& state
= GetApiState(pDC
);
66 SWR_CS_CONTEXT csContext
{ 0 };
67 csContext
.tileCounter
= threadGroupId
;
68 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
69 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
70 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
71 csContext
.pTGSM
= pContext
->pScratch
[workerId
];
72 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
74 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
76 UPDATE_STAT(CsInvocations
, state
.totalThreadsInGroup
);
78 RDTSC_STOP(BEDispatch
, 1, 0);
81 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
83 SYNC_DESC
*pSync
= (SYNC_DESC
*)pUserData
;
86 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
87 SWR_ASSERT(x
== 0 && y
== 0);
89 if (pSync
->pfnCallbackFunc
!= nullptr)
91 pSync
->pfnCallbackFunc(pSync
->userData
, pSync
->userData2
, pSync
->userData3
);
95 void ProcessQueryStatsBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
97 QUERY_DESC
* pQueryDesc
= (QUERY_DESC
*)pUserData
;
98 SWR_STATS
* pStats
= pQueryDesc
->pStats
;
99 SWR_CONTEXT
*pContext
= pDC
->pContext
;
101 SWR_ASSERT(pStats
!= nullptr);
103 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
105 pStats
->DepthPassCount
+= pContext
->stats
[i
].DepthPassCount
;
107 pStats
->IaVertices
+= pContext
->stats
[i
].IaVertices
;
108 pStats
->IaPrimitives
+= pContext
->stats
[i
].IaPrimitives
;
109 pStats
->VsInvocations
+= pContext
->stats
[i
].VsInvocations
;
110 pStats
->HsInvocations
+= pContext
->stats
[i
].HsInvocations
;
111 pStats
->DsInvocations
+= pContext
->stats
[i
].DsInvocations
;
112 pStats
->GsInvocations
+= pContext
->stats
[i
].GsInvocations
;
113 pStats
->PsInvocations
+= pContext
->stats
[i
].PsInvocations
;
114 pStats
->CInvocations
+= pContext
->stats
[i
].CInvocations
;
115 pStats
->CsInvocations
+= pContext
->stats
[i
].CsInvocations
;
116 pStats
->CPrimitives
+= pContext
->stats
[i
].CPrimitives
;
117 pStats
->GsPrimitives
+= pContext
->stats
[i
].GsPrimitives
;
119 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
121 pStats
->SoWriteOffset
[stream
] += pContext
->stats
[i
].SoWriteOffset
[stream
];
123 /// @note client is required to provide valid write offset before every draw, so we clear
124 /// out the contents of the write offset when storing stats
125 pContext
->stats
[i
].SoWriteOffset
[stream
] = 0;
127 pStats
->SoPrimStorageNeeded
[stream
] += pContext
->stats
[i
].SoPrimStorageNeeded
[stream
];
128 pStats
->SoNumPrimsWritten
[stream
] += pContext
->stats
[i
].SoNumPrimsWritten
[stream
];
133 template<SWR_FORMAT format
>
134 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
136 auto lambda
= [&](int comp
)
138 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
139 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
142 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
143 for (uint32_t i
= 0; i
< numIter
; ++i
)
145 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
149 template<SWR_FORMAT format
>
150 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, DWORD clear
[4])
152 // convert clear color to hottile format
153 // clear color is in RGBA float/uint32
155 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
158 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
159 if (FormatTraits
<format
>::isNormalized(comp
))
161 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
162 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
164 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
165 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
168 uint32_t tileX
, tileY
;
169 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
170 const API_STATE
& state
= GetApiState(pDC
);
172 int top
= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
173 int bottom
= top
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
174 int left
= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
175 int right
= left
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
177 // intersect with scissor
178 top
= std::max(top
, state
.scissorInFixedPoint
.top
);
179 left
= std::max(left
, state
.scissorInFixedPoint
.left
);
180 bottom
= std::min(bottom
, state
.scissorInFixedPoint
.bottom
);
181 right
= std::min(right
, state
.scissorInFixedPoint
.right
);
183 // translate to local hottile origin
184 top
-= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
185 bottom
-= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
186 left
-= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
187 right
-= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
189 // convert to raster tiles
190 top
>>= (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
191 bottom
>>= (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
192 left
>>= (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
193 right
>>= (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
195 const int numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
196 // compute steps between raster tile samples / raster tiles / macro tile rows
197 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
198 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
199 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
200 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
202 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
);
203 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, left
, top
)) * numSamples
;
204 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
206 // loop over all raster tiles in the current hot tile
207 for (int y
= top
; y
<= bottom
; ++y
)
209 uint8_t* pRasterTile
= pRasterTileRow
;
210 for (int x
= left
; x
<= right
; ++x
)
212 for( int sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
214 ClearRasterTile
<format
>(pRasterTile
, vClear
);
215 pRasterTile
+= rasterTileSampleStep
;
218 pRasterTileRow
+= macroTileRowStep
;
221 pHotTile
->state
= HOTTILE_DIRTY
;
// Backend work item for a clear request on one macrotile. pUserData is
// reinterpreted as a CLEAR_DESC. Two code paths are visible below:
//   1. record the clear value in the hot tile's clearData and set its state
//      to HOTTILE_CLEAR, for each of color0 / depth / stencil selected in
//      pClear->flags.mask;
//   2. look up a per-format routine in sClearTilesTable and call it for the
//      macrotile.
// NOTE(review): the statement that selects between the two paths is not
// present in this listing (pClear is declared twice, so the paths must live
// in separate scopes) - confirm against the canonical file.
225 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
// ---- path 1: mark hot tiles as needing a clear ----
229 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
230 SWR_CONTEXT
*pContext
= pDC
->pContext
;
231 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
232 uint32_t numSamples
= GetNumSamples(sampleCount
);
234 SWR_ASSERT(pClear
->flags
.bits
!= 0); // shouldn't be here without a reason.
236 RDTSC_START(BEClear
);
// Color: only SWR_ATTACHMENT_COLOR0 is touched on this path.
238 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
240 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_COLOR0
, true, numSamples
);
241 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
// The four clearRTColor entries are copied bit-for-bit as DWORDs.
242 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
243 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
244 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
245 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
246 pHotTile
->state
= HOTTILE_CLEAR
;
// Depth: the clear depth is stored as its raw bit pattern in clearData[0].
249 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
251 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
252 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
253 pHotTile
->state
= HOTTILE_CLEAR
;
// Stencil: same scheme, using clearStencil.
256 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
258 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
260 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearStencil
;
261 pHotTile
->state
= HOTTILE_CLEAR
;
264 RDTSC_STOP(BEClear
, 0, 0);
// ---- path 2: clear tile contents through the per-format routine table ----
269 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
270 RDTSC_START(BEClear
);
272 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
274 /// @todo clear data should come in as RGBA32_FLOAT
// Reads the first four bytes of clearRTColor as 8-bit channel values and
// scales each by 1/255 to get a float clear color.
// NOTE(review): the declarations of clearFloat / clearData are not visible in
// this listing.
277 clearFloat
[0] = ((uint8_t*)(&pClear
->clearRTColor
))[0] / 255.0f
;
278 clearFloat
[1] = ((uint8_t*)(&pClear
->clearRTColor
))[1] / 255.0f
;
279 clearFloat
[2] = ((uint8_t*)(&pClear
->clearRTColor
))[2] / 255.0f
;
280 clearFloat
[3] = ((uint8_t*)(&pClear
->clearRTColor
))[3] / 255.0f
;
// Pass the float bit patterns along as DWORDs.
281 clearData
[0] = *(DWORD
*)&clearFloat
[0];
282 clearData
[1] = *(DWORD
*)&clearFloat
[1];
283 clearData
[2] = *(DWORD
*)&clearFloat
[2];
284 clearData
[3] = *(DWORD
*)&clearFloat
[3];
286 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
287 SWR_ASSERT(pfnClearTiles
!= nullptr);
289 pfnClearTiles(pDC
, SWR_ATTACHMENT_COLOR0
, macroTile
, clearData
);
292 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
// Only clearData[0] is assigned for depth.
295 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
296 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
297 SWR_ASSERT(pfnClearTiles
!= nullptr);
299 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, clearData
);
302 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
// clearStencil is widened into a 32-bit local before its bits are copied.
304 uint32_t value
= pClear
->clearStencil
;
306 clearData
[0] = *(DWORD
*)&value
;
// NOTE(review): unlike the color and depth branches, no
// SWR_ASSERT(pfnClearTiles != nullptr) is visible before this call.
307 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
309 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, clearData
);
312 RDTSC_STOP(BEClear
, 0, 0);
317 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
319 RDTSC_START(BEStoreTiles
);
320 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
321 SWR_CONTEXT
*pContext
= pDC
->pContext
;
323 #ifdef KNOB_ENABLE_RDTSC
324 uint32_t numTiles
= 0;
326 SWR_FORMAT srcFormat
;
327 switch (pDesc
->attachment
)
329 case SWR_ATTACHMENT_COLOR0
:
330 case SWR_ATTACHMENT_COLOR1
:
331 case SWR_ATTACHMENT_COLOR2
:
332 case SWR_ATTACHMENT_COLOR3
:
333 case SWR_ATTACHMENT_COLOR4
:
334 case SWR_ATTACHMENT_COLOR5
:
335 case SWR_ATTACHMENT_COLOR6
:
336 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
337 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
338 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
339 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc
->attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
343 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
345 // Only need to store the hottile if it's been rendered to...
346 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, pDesc
->attachment
, false);
349 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
350 if (pHotTile
->state
== HOTTILE_CLEAR
)
352 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
353 SWR_ASSERT(pfnClearTiles
!= nullptr);
355 pfnClearTiles(pDC
, pDesc
->attachment
, macroTile
, pHotTile
->clearData
);
358 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
360 int destX
= KNOB_MACROTILE_X_DIM
* x
;
361 int destY
= KNOB_MACROTILE_Y_DIM
* y
;
363 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
364 pDesc
->attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
368 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
370 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
373 RDTSC_STOP(BEStoreTiles
, numTiles
, pDC
->drawId
);
377 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
379 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
380 SWR_CONTEXT
*pContext
= pDC
->pContext
;
382 const int numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
384 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
386 if (pDesc
->attachmentMask
& (1 << i
))
388 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
389 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
392 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
398 #if KNOB_SIMD_WIDTH == 8
399 const __m256 vCenterOffsetsX
= {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
400 const __m256 vCenterOffsetsY
= {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
401 const __m256 vULOffsetsX
= {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
402 const __m256 vULOffsetsY
= {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
404 #error Unsupported vector width
408 bool CanEarlyZ(const SWR_PS_STATE
*pPSState
)
410 return (pPSState
->forceEarlyZ
|| (!pPSState
->writesODepth
&& !pPSState
->usesSourceDepth
&& !pPSState
->usesUAV
));
413 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
415 simdscalar vClipMask
= _simd_setzero_ps();
416 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
418 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
420 // pull triangle clip distance values from clip buffer
421 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
422 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
423 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
426 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
428 // clip if interpolated clip distance is < 0 || NAN
429 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
431 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
434 return _simd_movemask_ps(vClipMask
);
// Single-sample pixel backend for the raster tile at (x, y). Walks the tile in
// SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps; for each step it computes pixel
// barycentrics and depth, applies user clip distances, runs depth/stencil
// testing, invokes the pixel shader and passes the result to the output
// merger, then advances the color/depth/stencil pointers.
// NOTE(review): the body uses a type `T` (T::bInputCoverage, T::MultisampleT)
// but no template header is visible in this listing, and several
// braces/conditionals/jump targets appear to be missing - confirm the exact
// control flow against the canonical file.
438 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
440 RDTSC_START(BESingleSampleBackend
);
441 RDTSC_START(BESetup
);
443 SWR_CONTEXT
*pContext
= pDC
->pContext
;
444 const API_STATE
& state
= GetApiState(pDC
);
445 const SWR_RASTSTATE
& rastState
= state
.rastState
;
446 const SWR_PS_STATE
*pPSState
= &state
.psState
;
447 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
// Local copy of sample 0's coverage; it is shifted down as SIMD tiles are consumed.
448 uint64_t coverageMask
= work
.coverageMask
[0];
// Broadcast the per-triangle coefficients (I, J, Z, 1/det, 1/W) to all lanes.
451 BarycentricCoeffs coeffs
;
452 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
453 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
454 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
456 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
457 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
458 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
460 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
461 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
462 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
464 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
466 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
467 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
468 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Working copies of the output pointers; advanced at the end of every SIMD tile.
470 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
471 uint32_t NumRT
= state
.psState
.numRenderTargets
;
472 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
474 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
476 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
477 RDTSC_STOP(BESetup
, 0, 0);
// Pixel shader inputs that stay fixed for the whole triangle.
479 SWR_PS_CONTEXT psContext
;
480 psContext
.pAttribs
= work
.pAttribs
;
481 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
482 psContext
.frontFace
= work
.triFlags
.frontFacing
;
483 psContext
.primID
= work
.triFlags
.primID
;
485 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
486 psContext
.I
= work
.I
;
487 psContext
.J
= work
.J
;
488 psContext
.recipDet
= work
.recipDet
;
489 psContext
.pRecipW
= work
.pRecipW
;
490 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
491 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
// Walk the raster tile one SIMD tile at a time, rows first.
493 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Per-lane Y of the upper-left pixel corners, then of the pixel centers.
496 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
498 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
500 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// Shading work is done only when the current SIMD tile has coverage bits set.
// MASK is defined outside this listing - presumably the low SIMD-width bits.
502 if(coverageMask
& MASK
)
// Per-lane X of the upper-left pixel corners, then of the pixel centers.
504 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
506 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
508 if(T::bInputCoverage
)
510 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
513 RDTSC_START(BEBarycentric
);
514 CalcPixelBarycentrics(coeffs
, psContext
);
// NOTE(review): the condition guarding the centroid copies below is not
// visible in this listing.
518 // for 1x case, centroid is pixel center
519 psContext
.vX
.centroid
= psContext
.vX
.center
;
520 psContext
.vY
.centroid
= psContext
.vY
.center
;
521 psContext
.vI
.centroid
= psContext
.vI
.center
;
522 psContext
.vJ
.centroid
= psContext
.vJ
.center
;
523 psContext
.vOneOverW
.centroid
= psContext
.vOneOverW
.center
;
526 // interpolate and quantize z
527 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
528 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
529 RDTSC_STOP(BEBarycentric
, 0, 0);
531 simdmask clipCoverageMask
= coverageMask
& MASK
;
532 // interpolate user clip distance if available
533 if(rastState
.clipDistanceMask
)
// Remove lanes that ComputeUserClipMask reports as clipped.
535 clipCoverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
536 psContext
.vI
.center
, psContext
.vJ
.center
);
// Depth and stencil pass masks start out equal to the (clipped) coverage.
539 simdscalar vCoverageMask
= vMask(clipCoverageMask
);
540 simdscalar depthPassMask
= vCoverageMask
;
541 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test performed before the pixel shader.
// NOTE(review): the condition that enables this early test is not visible here.
546 RDTSC_START(BEEarlyDepthTest
);
547 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
548 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
549 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
551 // early-exit if no pixels passed depth or earlyZ is forced on
552 if(pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
554 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
555 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
// NOTE(review): the statement executed when no lane passed depth is missing
// from this listing.
557 if (!_simd_movemask_ps(depthPassMask
))
564 psContext
.sampleIndex
= 0;
565 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
567 // execute pixel shader
568 RDTSC_START(BEPixelShader
);
569 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
570 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
571 RDTSC_STOP(BEPixelShader
, 0, 0);
// Pick up the active mask as left by the shader call.
573 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Depth/stencil test performed after the pixel shader, using the updated mask.
// NOTE(review): the condition that enables this late test is not visible here.
578 RDTSC_START(BELateDepthTest
);
579 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
580 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
581 RDTSC_STOP(BELateDepthTest
, 0, 0);
583 if(!_simd_movemask_ps(depthPassMask
))
585 // need to call depth/stencil write for stencil write
586 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
587 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
// Count lanes that passed the depth test.
592 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
593 uint32_t statCount
= _mm_popcnt_u32(statMask
);
594 UPDATE_STAT(DepthPassCount
, statCount
);
597 RDTSC_START(BEOutputMerger
);
598 OutputMerger(psContext
, pColorBase
, 0, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
600 // do final depth write after all pixel kills
601 if (!pPSState
->forceEarlyZ
)
603 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
604 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
606 RDTSC_STOP(BEOutputMerger
, 0, 0);
// End of SIMD tile: drop the consumed coverage bits and step every buffer
// pointer forward by one SIMD tile's worth of bytes for its hot tile format.
610 RDTSC_START(BEEndTile
);
611 coverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
612 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
613 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
615 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
617 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
619 RDTSC_STOP(BEEndTile
, 0, 0);
622 RDTSC_STOP(BESingleSampleBackend
, 0, 0);
// Sample-rate pixel backend for the raster tile at (x, y). Walks the tile in
// SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps and, for every sample index in
// [0, T::MultisampleT::numSamples), computes per-sample barycentrics and
// depth, applies user clip distances, runs depth/stencil testing against that
// sample's depth/stencil storage, invokes the pixel shader with sampleIndex
// set, and passes the result to the output merger.
// Unlike the single-sample backend visible earlier in this file, coverage is
// consumed by shifting work.coverageMask[sample] in place.
// NOTE(review): the body uses a type `T` but no template header is visible in
// this listing, and several braces/conditionals/jump statements appear to be
// missing - confirm the exact control flow against the canonical file.
626 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
628 RDTSC_START(BESampleRateBackend
);
629 RDTSC_START(BESetup
);
631 SWR_CONTEXT
*pContext
= pDC
->pContext
;
632 const API_STATE
& state
= GetApiState(pDC
);
633 const SWR_RASTSTATE
& rastState
= state
.rastState
;
634 const SWR_PS_STATE
*pPSState
= &state
.psState
;
635 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
// Broadcast the per-triangle coefficients (I, J, Z, 1/det, 1/W) to all lanes.
638 BarycentricCoeffs coeffs
;
639 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
640 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
641 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
643 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
644 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
645 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
647 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
648 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
649 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
651 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
653 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
654 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
655 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Working copies of the output pointers; advanced at the end of every SIMD tile.
657 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
658 uint32_t NumRT
= state
.psState
.numRenderTargets
;
659 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
661 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
663 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
664 RDTSC_STOP(BESetup
, 0, 0);
// Pixel shader inputs that stay fixed for the whole triangle.
666 SWR_PS_CONTEXT psContext
;
667 psContext
.pAttribs
= work
.pAttribs
;
668 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
669 psContext
.pRecipW
= work
.pRecipW
;
670 psContext
.frontFace
= work
.triFlags
.frontFacing
;
671 psContext
.primID
= work
.triFlags
.primID
;
673 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
674 psContext
.I
= work
.I
;
675 psContext
.J
= work
.J
;
676 psContext
.recipDet
= work
.recipDet
;
677 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
678 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
// Walk the raster tile one SIMD tile at a time, rows first.
680 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Per-lane Y of the upper-left pixel corners, then of the pixel centers.
683 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
685 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
687 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// Per-lane X of the upper-left pixel corners, then of the pixel centers.
689 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
691 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
// Pixel-center barycentrics are computed once per SIMD tile, before the
// per-sample loop.
693 RDTSC_START(BEBarycentric
);
694 CalcPixelBarycentrics(coeffs
, psContext
);
695 RDTSC_STOP(BEBarycentric
, 0, 0);
697 if(T::bInputCoverage
)
699 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
// NOTE(review): the condition guarding the centroid computation below is not
// visible in this listing.
704 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
705 RDTSC_START(BEBarycentric
);
706 CalcCentroidBarycentrics
<T
>(coeffs
, psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
707 RDTSC_STOP(BEBarycentric
, 0, 0);
// Shade every sample of this SIMD tile separately.
710 for(uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
// Coverage bits of the current SIMD tile for this sample. MASK is defined
// outside this listing - presumably the low SIMD-width bits.
712 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
715 RDTSC_START(BEBarycentric
);
716 // calculate per sample positions
717 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, T::MultisampleT::vX(sample
));
718 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, T::MultisampleT::vY(sample
));
720 CalcSampleBarycentrics(coeffs
, psContext
);
722 // interpolate and quantize z
723 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
724 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
725 RDTSC_STOP(BEBarycentric
, 0, 0);
727 // interpolate user clip distance if available
728 if (rastState
.clipDistanceMask
)
// Remove lanes that ComputeUserClipMask reports as clipped at the sample position.
730 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
731 psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Depth and stencil pass masks start out equal to the (clipped) coverage.
734 simdscalar vCoverageMask
= vMask(coverageMask
);
735 simdscalar depthPassMask
= vCoverageMask
;
736 simdscalar stencilPassMask
= vCoverageMask
;
738 // offset depth/stencil buffers current sample
739 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
740 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
// Depth/stencil test performed before the pixel shader.
// NOTE(review): the condition that enables this early test is not visible here.
745 RDTSC_START(BEEarlyDepthTest
);
746 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
747 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
748 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
750 // early-exit if no samples passed depth or earlyZ is forced on.
751 if (pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
753 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
754 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
// When no lane passed, this sample's coverage is advanced to the next SIMD
// tile here. NOTE(review): whatever follows the shift (presumably skipping to
// the next sample) is missing from this listing.
756 if (!_simd_movemask_ps(depthPassMask
))
758 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
764 psContext
.sampleIndex
= sample
;
765 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
767 // execute pixel shader
768 RDTSC_START(BEPixelShader
);
769 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
770 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
771 RDTSC_STOP(BEPixelShader
, 0, 0);
// Pick up the active mask as left by the shader call.
773 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Depth/stencil test performed after the pixel shader, using the updated mask.
// NOTE(review): the condition that enables this late test is not visible here.
778 RDTSC_START(BELateDepthTest
);
779 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
780 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
781 RDTSC_STOP(BELateDepthTest
, 0, 0);
783 if (!_simd_movemask_ps(depthPassMask
))
785 // need to call depth/stencil write for stencil write
786 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
787 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
789 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Count lanes that passed the depth test.
794 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
795 uint32_t statCount
= _mm_popcnt_u32(statMask
);
796 UPDATE_STAT(DepthPassCount
, statCount
);
799 RDTSC_START(BEOutputMerger
);
800 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
802 // do final depth write after all pixel kills
803 if (!pPSState
->forceEarlyZ
)
805 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
806 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
808 RDTSC_STOP(BEOutputMerger
, 0, 0);
// Normal end of a sample: advance this sample's coverage to the next SIMD tile.
810 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// End of SIMD tile: step every buffer pointer forward by one SIMD tile's worth
// of bytes for its hot tile format.
812 RDTSC_START(BEEndTile
);
813 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
814 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
816 for (uint32_t rt
= 0; rt
< NumRT
; ++rt
)
818 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
820 RDTSC_STOP(BEEndTile
, 0, 0);
823 RDTSC_STOP(BESampleRateBackend
, 0, 0);
827 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
829 RDTSC_START(BEPixelRateBackend
);
830 RDTSC_START(BESetup
);
832 SWR_CONTEXT
*pContext
= pDC
->pContext
;
833 const API_STATE
& state
= GetApiState(pDC
);
834 const SWR_RASTSTATE
& rastState
= state
.rastState
;
835 const SWR_PS_STATE
*pPSState
= &state
.psState
;
836 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
839 BarycentricCoeffs coeffs
;
840 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
841 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
842 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
844 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
845 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
846 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
848 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
849 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
850 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
852 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
854 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
855 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
856 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
858 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
859 uint32_t NumRT
= state
.psState
.numRenderTargets
;
860 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
862 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
864 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
865 RDTSC_STOP(BESetup
, 0, 0);
867 SWR_PS_CONTEXT psContext
;
868 psContext
.pAttribs
= work
.pAttribs
;
869 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
870 psContext
.frontFace
= work
.triFlags
.frontFacing
;
871 psContext
.primID
= work
.triFlags
.primID
;
872 psContext
.pRecipW
= work
.pRecipW
;
873 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
874 psContext
.I
= work
.I
;
875 psContext
.J
= work
.J
;
876 psContext
.recipDet
= work
.recipDet
;
877 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
878 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
879 psContext
.sampleIndex
= 0;
881 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, work
, coeffs
, state
, pDepthBase
, pStencilBase
, rastState
.clipDistanceMask
);
883 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
885 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
886 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
887 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
889 simdscalar activeLanes
;
890 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
891 activeLanes
= vMask(work
.anyCoveredSamples
& MASK
);
893 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
894 // set pixel center positions
895 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
897 RDTSC_START(BEBarycentric
);
898 CalcPixelBarycentrics(coeffs
, psContext
);
899 RDTSC_STOP(BEBarycentric
, 0, 0);
901 if (T::bInputCoverage
)
903 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
908 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
909 RDTSC_START(BEBarycentric
);
910 CalcCentroidBarycentrics
<T
>(coeffs
, psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
911 RDTSC_STOP(BEBarycentric
, 0, 0);
914 if(T::bForcedSampleCount
)
916 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
917 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState
->sampleMask
), _simd_setzero_si()));
918 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
922 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
924 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
925 UPDATE_STAT(DepthPassCount
, depthPassCount
);
928 // if we have no covered samples that passed depth at this point, go to next tile
929 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
931 if(pPSState
->usesSourceDepth
)
933 RDTSC_START(BEBarycentric
);
934 // interpolate and quantize z
935 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
936 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
937 RDTSC_STOP(BEBarycentric
, 0, 0);
940 // pixels that are currently active
941 psContext
.activeMask
= _simd_castps_si(activeLanes
);
942 psContext
.oMask
= T::MultisampleT::FullSampleMask();
944 // execute pixel shader
945 RDTSC_START(BEPixelShader
);
946 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
947 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
948 RDTSC_STOP(BEPixelShader
, 0, 0);
950 // update active lanes to remove any discarded or oMask'd pixels
951 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
952 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
955 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
957 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
958 UPDATE_STAT(DepthPassCount
, depthPassCount
);
961 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
962 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
965 // loop over all samples, broadcasting the results of the PS to all passing pixels
966 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(pBlendState
->sampleCount
); sample
++)
968 RDTSC_START(BEOutputMerger
);
969 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
970 uint32_t coverageSampleNum
= (T::bIsStandardPattern
) ? sample
: 0;
971 simdscalar coverageMask
, depthMask
;
972 if(T::bForcedSampleCount
)
974 coverageMask
= depthMask
= activeLanes
;
978 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
979 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
980 if(!_simd_movemask_ps(depthMask
))
982 // stencil should already have been written in early/lateZ tests
983 RDTSC_STOP(BEOutputMerger
, 0, 0);
988 // broadcast the results of the PS to all passing pixels
989 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, pPSState
->numRenderTargets
);
991 if(!pPSState
->forceEarlyZ
&& !T::bForcedSampleCount
)
993 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
994 uint8_t * pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
996 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
997 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
999 RDTSC_STOP(BEOutputMerger
, 0, 0);
1002 RDTSC_START(BEEndTile
);
1003 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1005 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1008 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1009 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1010 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1012 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1014 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1016 RDTSC_STOP(BEEndTile
, 0, 0);
1019 RDTSC_STOP(BEPixelRateBackend
, 0, 0);
1021 // optimized backend flow with NULL PS
1022 template<uint32_t sampleCountT
>
1023 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1025 RDTSC_START(BENullBackend
);
1026 ///@todo: handle center multisample pattern
1027 typedef SwrBackendTraits
<sampleCountT
, SWR_MSAA_STANDARD_PATTERN
> T
;
1028 RDTSC_START(BESetup
);
1030 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1031 const API_STATE
& state
= GetApiState(pDC
);
1032 const SWR_RASTSTATE
& rastState
= pDC
->pState
->state
.rastState
;
1034 // broadcast scalars
1035 BarycentricCoeffs coeffs
;
1036 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
1037 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
1038 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
1040 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
1041 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
1042 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
1044 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
1045 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
1046 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
1048 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
1050 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
1052 RDTSC_STOP(BESetup
, 0, 0);
1054 SWR_PS_CONTEXT psContext
;
1055 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
1058 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
1060 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
1063 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
1065 // iterate over active samples
1066 unsigned long sample
= 0;
1067 uint32_t sampleMask
= state
.blendState
.sampleMask
;
1068 while (_BitScanForward(&sample
, sampleMask
))
1070 sampleMask
&= ~(1 << sample
);
1071 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
1074 RDTSC_START(BEBarycentric
);
1075 // calculate per sample positions
1076 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, T::MultisampleT::vX(sample
));
1077 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, T::MultisampleT::vY(sample
));
1079 CalcSampleBarycentrics(coeffs
, psContext
);
1081 // interpolate and quantize z
1082 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1083 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
1085 RDTSC_STOP(BEBarycentric
, 0, 0);
1087 // interpolate user clip distance if available
1088 if (rastState
.clipDistanceMask
)
1090 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
1091 psContext
.vI
.sample
, psContext
.vJ
.sample
);
1094 simdscalar vCoverageMask
= vMask(coverageMask
);
1095 simdscalar stencilPassMask
= vCoverageMask
;
1097 // offset depth/stencil buffers current sample
1098 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
1099 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
1101 RDTSC_START(BEEarlyDepthTest
);
1102 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
1103 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1104 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1105 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1106 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
1108 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1109 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1110 UPDATE_STAT(DepthPassCount
, statCount
);
1112 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1114 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1115 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1118 RDTSC_STOP(BENullBackend
, 0, 0);
1121 void InitClearTilesTable()
1123 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
1125 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1126 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1127 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1128 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1129 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
1132 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
1133 PFN_BACKEND_FUNC gBackendSingleSample
[2] // input coverage
1137 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
]
1138 [SWR_MSAA_SAMPLE_PATTERN_MAX
]
1139 [SWR_INPUT_COVERAGE_MAX
]
1141 [2] // forcedSampleCount
1144 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_INPUT_COVERAGE_MAX
]
1149 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1150 // arguments to static template arguments.
1151 template <uint32_t... ArgsT
>
1154 // Last Arg Terminator
1155 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1159 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1160 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1161 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1163 SWR_ASSERT(0 && "Invalid backend func\n");
1169 // Recursively parse args
1170 template <typename
... TArgsT
>
1171 static PFN_BACKEND_FUNC
GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg
, TArgsT
... remainingArgs
)
1175 case SWR_MSAA_CENTER_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_CENTER_PATTERN
>::GetFunc(remainingArgs
...); break;
1176 case SWR_MSAA_STANDARD_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...); break;
1178 SWR_ASSERT(0 && "Invalid sample pattern\n");
1179 return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...);
1184 // Recursively parse args
1185 template <typename
... TArgsT
>
1186 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1190 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1191 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1192 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1193 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1194 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1196 SWR_ASSERT(0 && "Invalid sample count\n");
1197 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1202 // Recursively parse args
1203 template <typename
... TArgsT
>
1204 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1208 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1211 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1215 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[2][2][2])
1217 for(uint32_t inputCoverage
= SWR_INPUT_COVERAGE_NONE
; inputCoverage
< SWR_INPUT_COVERAGE_MAX
; inputCoverage
++)
1219 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1221 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1223 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1224 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, (inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
),
1225 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
1231 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][SWR_INPUT_COVERAGE_MAX
]
1234 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_MAX
; sampleCount
++)
1236 for(uint32_t samplePattern
= SWR_MSAA_CENTER_PATTERN
; samplePattern
< SWR_MSAA_SAMPLE_PATTERN_MAX
; samplePattern
++)
1238 for(uint32_t inputCoverage
= SWR_INPUT_COVERAGE_NONE
; inputCoverage
< SWR_INPUT_COVERAGE_MAX
; inputCoverage
++)
1240 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1242 for(uint32_t forcedSampleCount
= 0; forcedSampleCount
< 2; forcedSampleCount
++)
1244 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1246 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][forcedSampleCount
][canEarlyZ
] =
1247 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, (SWR_MSAA_SAMPLE_PATTERN
)samplePattern
, (inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
),
1248 (isCentroid
> 0), (forcedSampleCount
> 0), (canEarlyZ
> 0), SWR_BACKEND_MSAA_PIXEL_RATE
);
1257 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_MAX
][SWR_INPUT_COVERAGE_MAX
][2][2])
1259 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_MAX
; sampleCount
++)
1261 for(uint32_t inputCoverage
= SWR_INPUT_COVERAGE_NONE
; inputCoverage
< SWR_INPUT_COVERAGE_MAX
; inputCoverage
++)
1263 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1265 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1267 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1268 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, (inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
),
1269 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1276 void InitBackendFuncTables()
1278 InitBackendSingleFuncTable(gBackendSingleSample
);
1279 InitBackendPixelFuncTable(gBackendPixelRateTable
);
1280 InitBackendSampleFuncTable(gBackendSampleRateTable
);
1282 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1283 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1284 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1285 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1286 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;