1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "depthstencil.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
40 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, DWORD
[4]);
41 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
)
50 RDTSC_START(BEDispatch
);
52 SWR_CONTEXT
*pContext
= pDC
->pContext
;
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 const API_STATE
& state
= GetApiState(pDC
);
66 SWR_CS_CONTEXT csContext
{ 0 };
67 csContext
.tileCounter
= threadGroupId
;
68 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
69 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
70 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
71 csContext
.pTGSM
= pContext
->pScratch
[workerId
];
72 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
74 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
76 UPDATE_STAT(CsInvocations
, state
.totalThreadsInGroup
);
78 RDTSC_STOP(BEDispatch
, 1, 0);
81 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
84 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
85 SWR_ASSERT(x
== 0 && y
== 0);
88 void ProcessQueryStatsBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
90 QUERY_DESC
* pQueryDesc
= (QUERY_DESC
*)pUserData
;
91 SWR_STATS
* pStats
= pQueryDesc
->pStats
;
92 SWR_CONTEXT
*pContext
= pDC
->pContext
;
94 SWR_ASSERT(pStats
!= nullptr);
96 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
98 pStats
->DepthPassCount
+= pContext
->stats
[i
].DepthPassCount
;
100 pStats
->IaVertices
+= pContext
->stats
[i
].IaVertices
;
101 pStats
->IaPrimitives
+= pContext
->stats
[i
].IaPrimitives
;
102 pStats
->VsInvocations
+= pContext
->stats
[i
].VsInvocations
;
103 pStats
->HsInvocations
+= pContext
->stats
[i
].HsInvocations
;
104 pStats
->DsInvocations
+= pContext
->stats
[i
].DsInvocations
;
105 pStats
->GsInvocations
+= pContext
->stats
[i
].GsInvocations
;
106 pStats
->PsInvocations
+= pContext
->stats
[i
].PsInvocations
;
107 pStats
->CInvocations
+= pContext
->stats
[i
].CInvocations
;
108 pStats
->CsInvocations
+= pContext
->stats
[i
].CsInvocations
;
109 pStats
->CPrimitives
+= pContext
->stats
[i
].CPrimitives
;
110 pStats
->GsPrimitives
+= pContext
->stats
[i
].GsPrimitives
;
112 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
114 pStats
->SoWriteOffset
[stream
] += pContext
->stats
[i
].SoWriteOffset
[stream
];
116 /// @note client is required to provide valid write offset before every draw, so we clear
117 /// out the contents of the write offset when storing stats
118 pContext
->stats
[i
].SoWriteOffset
[stream
] = 0;
120 pStats
->SoPrimStorageNeeded
[stream
] += pContext
->stats
[i
].SoPrimStorageNeeded
[stream
];
121 pStats
->SoNumPrimsWritten
[stream
] += pContext
->stats
[i
].SoNumPrimsWritten
[stream
];
126 template<SWR_FORMAT format
>
127 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
129 auto lambda
= [&](int comp
)
131 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
132 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
135 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
136 for (uint32_t i
= 0; i
< numIter
; ++i
)
138 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
142 template<SWR_FORMAT format
>
143 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, DWORD clear
[4])
145 // convert clear color to hottile format
146 // clear color is in RGBA float/uint32
148 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
151 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
152 if (FormatTraits
<format
>::isNormalized(comp
))
154 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
155 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
157 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
158 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
161 uint32_t tileX
, tileY
;
162 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
163 const API_STATE
& state
= GetApiState(pDC
);
165 int top
= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
166 int bottom
= top
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
167 int left
= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
168 int right
= left
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
170 // intersect with scissor
171 top
= std::max(top
, state
.scissorInFixedPoint
.top
);
172 left
= std::max(left
, state
.scissorInFixedPoint
.left
);
173 bottom
= std::min(bottom
, state
.scissorInFixedPoint
.bottom
);
174 right
= std::min(right
, state
.scissorInFixedPoint
.right
);
176 // translate to local hottile origin
177 top
-= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
178 bottom
-= KNOB_MACROTILE_Y_DIM_FIXED
* tileY
;
179 left
-= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
180 right
-= KNOB_MACROTILE_X_DIM_FIXED
* tileX
;
182 // convert to raster tiles
183 top
>>= (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
184 bottom
>>= (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
185 left
>>= (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
186 right
>>= (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
188 const int numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
189 // compute steps between raster tile samples / raster tiles / macro tile rows
190 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
191 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
192 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
193 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
195 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
);
196 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, left
, top
)) * numSamples
;
197 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
199 // loop over all raster tiles in the current hot tile
200 for (int y
= top
; y
<= bottom
; ++y
)
202 uint8_t* pRasterTile
= pRasterTileRow
;
203 for (int x
= left
; x
<= right
; ++x
)
205 for( int sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
207 ClearRasterTile
<format
>(pRasterTile
, vClear
);
208 pRasterTile
+= rasterTileSampleStep
;
211 pRasterTileRow
+= macroTileRowStep
;
214 pHotTile
->state
= HOTTILE_DIRTY
;
// ProcessClearBE: backend handler for a clear work item on one macrotile.
// pUserData is interpreted as a CLEAR_DESC whose flags select color, depth
// and/or stencil clears.
// NOTE(review): two separate `CLEAR_DESC *pClear` declarations appear below, so
// the body contains two distinct scopes (two clear strategies). The construct
// that chooses between them is not visible in this view of the file - confirm
// against the full source before editing.
218 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
// First strategy: record the clear value in the hot tile and flag the tile
// HOTTILE_CLEAR; no pixel data is written here.
222 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
223 SWR_CONTEXT
*pContext
= pDC
->pContext
;
224 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
225 uint32_t numSamples
= GetNumSamples(sampleCount
);
227 SWR_ASSERT(pClear
->flags
.bits
!= 0); // shouldn't be here without a reason.
229 RDTSC_START(BEClear
);
231 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
233 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_COLOR0
, true, numSamples
);
234 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
// The four clear-color entries are copied as raw 32-bit patterns (no numeric conversion).
235 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
236 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
237 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
238 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
239 pHotTile
->state
= HOTTILE_CLEAR
;
242 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
244 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
245 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
246 pHotTile
->state
= HOTTILE_CLEAR
;
249 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
251 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
253 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearStencil
;
254 pHotTile
->state
= HOTTILE_CLEAR
;
257 RDTSC_STOP(BEClear
, 0, 0);
// Second strategy: clear immediately by calling the per-format routine from
// sClearTilesTable for each requested attachment.
262 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
263 RDTSC_START(BEClear
);
265 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
267 /// @todo clear data should come in as RGBA32_FLOAT
// Here the first four BYTES of clearRTColor are read as 8-bit channels and
// divided by 255.0f; the resulting floats' bit patterns become the clear data.
270 clearFloat
[0] = ((uint8_t*)(&pClear
->clearRTColor
))[0] / 255.0f
;
271 clearFloat
[1] = ((uint8_t*)(&pClear
->clearRTColor
))[1] / 255.0f
;
272 clearFloat
[2] = ((uint8_t*)(&pClear
->clearRTColor
))[2] / 255.0f
;
273 clearFloat
[3] = ((uint8_t*)(&pClear
->clearRTColor
))[3] / 255.0f
;
274 clearData
[0] = *(DWORD
*)&clearFloat
[0];
275 clearData
[1] = *(DWORD
*)&clearFloat
[1];
276 clearData
[2] = *(DWORD
*)&clearFloat
[2];
277 clearData
[3] = *(DWORD
*)&clearFloat
[3];
279 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
280 SWR_ASSERT(pfnClearTiles
!= nullptr);
282 pfnClearTiles(pDC
, SWR_ATTACHMENT_COLOR0
, macroTile
, clearData
);
285 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
288 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
289 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
290 SWR_ASSERT(pfnClearTiles
!= nullptr);
292 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, clearData
);
295 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
// The stencil value is first copied into a 32-bit local before its bits are taken.
297 uint32_t value
= pClear
->clearStencil
;
299 clearData
[0] = *(DWORD
*)&value
;
300 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
302 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, clearData
);
305 RDTSC_STOP(BEClear
, 0, 0);
310 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
312 RDTSC_START(BEStoreTiles
);
313 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
314 SWR_CONTEXT
*pContext
= pDC
->pContext
;
316 #ifdef KNOB_ENABLE_RDTSC
317 uint32_t numTiles
= 0;
319 SWR_FORMAT srcFormat
;
320 switch (pDesc
->attachment
)
322 case SWR_ATTACHMENT_COLOR0
:
323 case SWR_ATTACHMENT_COLOR1
:
324 case SWR_ATTACHMENT_COLOR2
:
325 case SWR_ATTACHMENT_COLOR3
:
326 case SWR_ATTACHMENT_COLOR4
:
327 case SWR_ATTACHMENT_COLOR5
:
328 case SWR_ATTACHMENT_COLOR6
:
329 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
330 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
331 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
332 default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc
->attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
336 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
338 // Only need to store the hottile if it's been rendered to...
339 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, pDesc
->attachment
, false);
342 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
343 if (pHotTile
->state
== HOTTILE_CLEAR
)
345 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
346 SWR_ASSERT(pfnClearTiles
!= nullptr);
348 pfnClearTiles(pDC
, pDesc
->attachment
, macroTile
, pHotTile
->clearData
);
351 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
353 int destX
= KNOB_MACROTILE_X_DIM
* x
;
354 int destY
= KNOB_MACROTILE_Y_DIM
* y
;
356 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
357 pDesc
->attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
361 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
363 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
366 RDTSC_STOP(BEStoreTiles
, numTiles
, pDC
->drawId
);
370 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
372 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
373 SWR_CONTEXT
*pContext
= pDC
->pContext
;
375 const int numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
377 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
379 if (pDesc
->attachmentMask
& (1 << i
))
381 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
382 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
385 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
391 #if KNOB_SIMD_WIDTH == 8
392 const __m256 vCenterOffsetsX
= {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
393 const __m256 vCenterOffsetsY
= {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
394 const __m256 vULOffsetsX
= {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
395 const __m256 vULOffsetsY
= {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
397 #error Unsupported vector width
400 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
402 simdscalar vClipMask
= _simd_setzero_ps();
403 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
405 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
407 // pull triangle clip distance values from clip buffer
408 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
409 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
410 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
413 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
415 // clip if interpolated clip distance is < 0 || NAN
416 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
418 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
421 return _simd_movemask_ps(vClipMask
);
// BackendSingleSample: pixel backend for one raster tile at (x, y) with a
// single sample per pixel. For each SIMD tile it computes barycentrics and
// depth, applies user clip distances, runs depth/stencil test, invokes the
// pixel shader, and calls the output merger and depth/stencil write.
// NOTE(review): `T` is referenced below (T::bInputCoverage, T::MultisampleT),
// so this is presumably a function template whose template header is not
// visible in this view. The conditions selecting the early vs. late depth
// test and the jumps that skip to the end-of-tile step are likewise not
// visible here - confirm against the full source.
425 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
427 RDTSC_START(BESingleSampleBackend
);
428 RDTSC_START(BESetup
);
430 SWR_CONTEXT
*pContext
= pDC
->pContext
;
431 const API_STATE
& state
= GetApiState(pDC
);
432 const SWR_RASTSTATE
& rastState
= state
.rastState
;
433 const SWR_PS_STATE
*pPSState
= &state
.psState
;
434 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
// Local copy of the sample-0 coverage bits; shifted down one SIMD tile at a time below.
435 uint64_t coverageMask
= work
.coverageMask
[0];
// Broadcast the triangle's scalar I/J/Z coefficients, recipDet and 1/W terms to SIMD width.
438 BarycentricCoeffs coeffs
;
439 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
440 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
441 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
443 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
444 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
445 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
447 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
448 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
449 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
451 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
453 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
454 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
455 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Working copies of the render target pointers; advanced per SIMD tile at the end-of-tile step.
457 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
458 uint32_t NumRT
= state
.psState
.numRenderTargets
;
459 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
461 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
463 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
464 RDTSC_STOP(BESetup
, 0, 0);
// Per-triangle pixel shader inputs that do not change across the tile.
466 SWR_PS_CONTEXT psContext
;
467 psContext
.pAttribs
= work
.pAttribs
;
468 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
469 psContext
.frontFace
= work
.triFlags
.frontFacing
;
470 psContext
.primID
= work
.triFlags
.primID
;
472 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
473 psContext
.I
= work
.I
;
474 psContext
.J
= work
.J
;
475 psContext
.recipDet
= work
.recipDet
;
476 psContext
.pRecipW
= work
.pRecipW
;
477 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
478 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
// Walk the raster tile in SIMD-tile steps, rows first.
480 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
483 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
485 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
487 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// Only SIMD tiles with at least one covered lane are processed.
489 if(coverageMask
& MASK
)
491 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
493 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
495 if(T::bInputCoverage
)
497 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
500 RDTSC_START(BEBarycentric
);
501 CalcPixelBarycentrics(coeffs
, psContext
);
503 // for 1x case, centroid is pixel center
504 psContext
.vX
.centroid
= psContext
.vX
.center
;
505 psContext
.vY
.centroid
= psContext
.vY
.center
;
506 psContext
.vI
.centroid
= psContext
.vI
.center
;
507 psContext
.vJ
.centroid
= psContext
.vJ
.center
;
508 psContext
.vOneOverW
.centroid
= psContext
.vOneOverW
.center
;
510 // interpolate and quantize z
511 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
512 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
513 RDTSC_STOP(BEBarycentric
, 0, 0);
515 simdmask clipCoverageMask
= coverageMask
& MASK
;
516 // interpolate user clip distance if available
517 if(rastState
.clipDistanceMask
)
519 clipCoverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
520 psContext
.vI
.center
, psContext
.vJ
.center
);
// Expand the post-clip coverage bits into a per-lane vector mask; the depth
// and stencil pass masks start out equal to it.
523 simdscalar vCoverageMask
= vMask(clipCoverageMask
);
524 simdscalar depthPassMask
= vCoverageMask
;
525 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test performed before the pixel shader runs.
530 RDTSC_START(BEEarlyDepthTest
);
531 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
532 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
533 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
535 // early-exit if no pixels passed depth or earlyZ is forced on
536 if(pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
538 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
539 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
541 if (!_simd_movemask_ps(depthPassMask
))
548 psContext
.sampleIndex
= 0;
549 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
551 // execute pixel shader
552 RDTSC_START(BEPixelShader
);
553 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
554 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
555 RDTSC_STOP(BEPixelShader
, 0, 0);
// psContext was handed to the shader by pointer; re-read activeMask as the
// coverage used from here on.
557 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Depth/stencil test performed after the pixel shader has run.
562 RDTSC_START(BELateDepthTest
);
563 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
564 psContext
.vZ
, pDepthBase
, vCoverageMask
, pStencilBase
, &stencilPassMask
);
565 RDTSC_STOP(BELateDepthTest
, 0, 0);
567 if(!_simd_movemask_ps(depthPassMask
))
569 // need to call depth/stencil write for stencil write
570 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
571 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
// Count the lanes that passed depth for the DepthPassCount statistic.
576 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
577 uint32_t statCount
= _mm_popcnt_u32(statMask
);
578 UPDATE_STAT(DepthPassCount
, statCount
);
581 RDTSC_START(BEOutputMerger
);
582 OutputMerger(psContext
, pColorBase
, 0, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
584 // do final depth write after all pixel kills
585 if (!pPSState
->forceEarlyZ
)
587 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
588 pDepthBase
, depthPassMask
, vCoverageMask
, pStencilBase
, stencilPassMask
);
590 RDTSC_STOP(BEOutputMerger
, 0, 0);
// End-of-tile step: consume this SIMD tile's coverage bits and advance the
// depth, stencil and color pointers by one SIMD tile's worth of bytes.
594 RDTSC_START(BEEndTile
);
595 coverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
596 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
597 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
599 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
601 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
603 RDTSC_STOP(BEEndTile
, 0, 0);
606 RDTSC_STOP(BESingleSampleBackend
, 0, 0);
// BackendSampleRate: pixel backend for one raster tile at (x, y) that runs
// the depth/stencil test, pixel shader and output merger once per sample
// (loop over T::MultisampleT::numSamples) for every SIMD tile.
// NOTE(review): `T` is referenced below, so this is presumably a function
// template whose template header is not visible in this view. The conditions
// selecting the centroid path, the early vs. late depth test, and the jumps
// that skip the rest of a sample iteration are likewise not visible here -
// confirm against the full source.
610 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
612 RDTSC_START(BESampleRateBackend
);
613 RDTSC_START(BESetup
);
615 SWR_CONTEXT
*pContext
= pDC
->pContext
;
616 const API_STATE
& state
= GetApiState(pDC
);
617 const SWR_RASTSTATE
& rastState
= state
.rastState
;
618 const SWR_PS_STATE
*pPSState
= &state
.psState
;
619 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
// Broadcast the triangle's scalar I/J/Z coefficients, recipDet and 1/W terms to SIMD width.
622 BarycentricCoeffs coeffs
;
623 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
624 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
625 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
627 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
628 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
629 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
631 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
632 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
633 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
635 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
637 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
638 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
639 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Working copies of the render target pointers; advanced per SIMD tile at the end-of-tile step.
641 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
642 uint32_t NumRT
= state
.psState
.numRenderTargets
;
643 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
645 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
647 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
648 RDTSC_STOP(BESetup
, 0, 0);
// Per-triangle pixel shader inputs that do not change across the tile.
650 SWR_PS_CONTEXT psContext
;
651 psContext
.pAttribs
= work
.pAttribs
;
652 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
653 psContext
.pRecipW
= work
.pRecipW
;
654 psContext
.frontFace
= work
.triFlags
.frontFacing
;
655 psContext
.primID
= work
.triFlags
.primID
;
657 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
658 psContext
.I
= work
.I
;
659 psContext
.J
= work
.J
;
660 psContext
.recipDet
= work
.recipDet
;
661 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
662 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
// Walk the raster tile in SIMD-tile steps, rows first.
664 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
667 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
669 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
671 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
673 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
675 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
677 RDTSC_START(BEBarycentric
);
678 CalcPixelBarycentrics(coeffs
, psContext
);
679 RDTSC_STOP(BEBarycentric
, 0, 0);
681 if(T::bInputCoverage
)
683 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
688 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
689 RDTSC_START(BEBarycentric
);
690 if(T::bIsStandardPattern
)
692 CalcCentroidPos
<T
>(psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
696 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
697 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
699 CalcCentroidBarycentrics(coeffs
, psContext
, psContext
.vX
.UL
, psContext
.vY
.UL
);
700 RDTSC_STOP(BEBarycentric
, 0, 0);
704 psContext
.vX
.centroid
= psContext
.vX
.sample
;
705 psContext
.vY
.centroid
= psContext
.vY
.sample
;
// Per-sample loop: everything below up to the end-of-tile step runs once per sample.
708 for(uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
710 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
713 RDTSC_START(BEBarycentric
);
714 // calculate per sample positions
715 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, T::MultisampleT::vX(sample
));
716 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, T::MultisampleT::vY(sample
));
718 CalcSampleBarycentrics(coeffs
, psContext
);
720 // interpolate and quantize z
721 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
722 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
723 RDTSC_STOP(BEBarycentric
, 0, 0);
725 // interpolate user clip distance if available
726 if (rastState
.clipDistanceMask
)
728 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
729 psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Expand the post-clip coverage bits into a per-lane vector mask; the depth
// and stencil pass masks start out equal to it.
732 simdscalar vCoverageMask
= vMask(coverageMask
);
733 simdscalar depthPassMask
= vCoverageMask
;
734 simdscalar stencilPassMask
= vCoverageMask
;
736 // offset depth/stencil buffers current sample
737 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
738 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
// Depth/stencil test performed before the pixel shader runs.
743 RDTSC_START(BEEarlyDepthTest
);
744 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
745 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
746 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
748 // early-exit if no samples passed depth or earlyZ is forced on.
749 if (pPSState
->forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
751 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
752 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
754 if (!_simd_movemask_ps(depthPassMask
))
// This sample's coverage bits for the current SIMD tile are consumed here too;
// the same shift appears at each point where the per-sample body finishes.
756 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
762 psContext
.sampleIndex
= sample
;
763 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
765 // execute pixel shader
766 RDTSC_START(BEPixelShader
);
767 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
768 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
769 RDTSC_STOP(BEPixelShader
, 0, 0);
// psContext was handed to the shader by pointer; re-read activeMask as the
// coverage used from here on.
771 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Depth/stencil test performed after the pixel shader has run.
776 RDTSC_START(BELateDepthTest
);
777 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
778 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
779 RDTSC_STOP(BELateDepthTest
, 0, 0);
781 if (!_simd_movemask_ps(depthPassMask
))
783 // need to call depth/stencil write for stencil write
784 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
785 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
787 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Count the lanes that passed depth for the DepthPassCount statistic.
792 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
793 uint32_t statCount
= _mm_popcnt_u32(statMask
);
794 UPDATE_STAT(DepthPassCount
, statCount
);
797 RDTSC_START(BEOutputMerger
);
798 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, pPSState
->numRenderTargets
);
800 // do final depth write after all pixel kills
801 if (!pPSState
->forceEarlyZ
)
803 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
804 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
806 RDTSC_STOP(BEOutputMerger
, 0, 0);
808 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// End-of-tile step: advance the depth, stencil and color pointers by one
// SIMD tile's worth of bytes.
810 RDTSC_START(BEEndTile
);
811 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
812 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
814 for (uint32_t rt
= 0; rt
< NumRT
; ++rt
)
816 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
818 RDTSC_STOP(BEEndTile
, 0, 0);
821 RDTSC_STOP(BESampleRateBackend
, 0, 0);
825 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
827 RDTSC_START(BEPixelRateBackend
);
828 RDTSC_START(BESetup
);
830 SWR_CONTEXT
*pContext
= pDC
->pContext
;
831 const API_STATE
& state
= GetApiState(pDC
);
832 const SWR_RASTSTATE
& rastState
= state
.rastState
;
833 const SWR_PS_STATE
*pPSState
= &state
.psState
;
834 const SWR_BLEND_STATE
*pBlendState
= &state
.blendState
;
837 BarycentricCoeffs coeffs
;
838 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
839 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
840 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
842 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
843 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
844 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
846 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
847 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
848 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
850 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
852 coeffs
.vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
853 coeffs
.vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
854 coeffs
.vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
856 uint8_t *pColorBase
[SWR_NUM_RENDERTARGETS
];
857 uint32_t NumRT
= state
.psState
.numRenderTargets
;
858 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
860 pColorBase
[rt
] = renderBuffers
.pColor
[rt
];
862 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
863 RDTSC_STOP(BESetup
, 0, 0);
865 SWR_PS_CONTEXT psContext
;
866 psContext
.pAttribs
= work
.pAttribs
;
867 psContext
.pPerspAttribs
= work
.pPerspAttribs
;
868 psContext
.frontFace
= work
.triFlags
.frontFacing
;
869 psContext
.primID
= work
.triFlags
.primID
;
870 psContext
.pRecipW
= work
.pRecipW
;
871 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
872 psContext
.I
= work
.I
;
873 psContext
.J
= work
.J
;
874 psContext
.recipDet
= work
.recipDet
;
875 psContext
.pSamplePosX
= (const float*)&T::MultisampleT::samplePosX
;
876 psContext
.pSamplePosY
= (const float*)&T::MultisampleT::samplePosY
;
877 psContext
.sampleIndex
= 0;
879 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, work
, coeffs
, state
, pDepthBase
, pStencilBase
, rastState
.clipDistanceMask
);
881 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
883 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
884 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps((float)yy
));
885 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
887 simdscalar activeLanes
;
888 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
889 activeLanes
= vMask(work
.anyCoveredSamples
& MASK
);
891 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
892 // set pixel center positions
893 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps((float)xx
));
895 RDTSC_START(BEBarycentric
);
896 CalcPixelBarycentrics(coeffs
, psContext
);
897 RDTSC_STOP(BEBarycentric
, 0, 0);
899 if (T::bInputCoverage
)
901 generateInputCoverage
<T
>(&work
.coverageMask
[0], psContext
.inputMask
, pBlendState
->sampleMask
);
906 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
907 RDTSC_START(BEBarycentric
);
908 if(T::bIsStandardPattern
)
910 CalcCentroidPos
<T
>(psContext
, &work
.coverageMask
[0], pBlendState
->sampleMask
, psContext
.vX
.UL
, psContext
.vY
.UL
);
914 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
915 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
918 CalcCentroidBarycentrics(coeffs
, psContext
, psContext
.vX
.UL
, psContext
.vY
.UL
);
919 RDTSC_STOP(BEBarycentric
, 0, 0);
923 psContext
.vX
.centroid
= _simd_add_ps(psContext
.vX
.UL
, _simd_set1_ps(0.5f
));
924 psContext
.vY
.centroid
= _simd_add_ps(psContext
.vY
.UL
, _simd_set1_ps(0.5f
));
927 if(T::bForcedSampleCount
)
929 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
930 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState
->sampleMask
), _simd_setzero_si()));
931 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
935 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
937 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
938 UPDATE_STAT(DepthPassCount
, depthPassCount
);
941 // if we have no covered samples that passed depth at this point, go to next tile
942 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
944 if(pPSState
->usesSourceDepth
)
946 RDTSC_START(BEBarycentric
);
947 // interpolate and quantize z
948 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
949 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
950 RDTSC_STOP(BEBarycentric
, 0, 0);
953 // pixels that are currently active
954 psContext
.activeMask
= _simd_castps_si(activeLanes
);
955 psContext
.oMask
= T::MultisampleT::FullSampleMask();
957 // execute pixel shader
958 RDTSC_START(BEPixelShader
);
959 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
960 UPDATE_STAT(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
961 RDTSC_STOP(BEPixelShader
, 0, 0);
963 // update active lanes to remove any discarded or oMask'd pixels
964 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
965 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
968 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
970 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
971 UPDATE_STAT(DepthPassCount
, depthPassCount
);
974 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
975 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
978 // loop over all samples, broadcasting the results of the PS to all passing pixels
979 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(pBlendState
->sampleCount
); sample
++)
981 RDTSC_START(BEOutputMerger
);
982 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
983 uint32_t coverageSampleNum
= (T::bIsStandardPattern
) ? sample
: 0;
984 simdscalar coverageMask
, depthMask
;
985 if(T::bForcedSampleCount
)
987 coverageMask
= depthMask
= activeLanes
;
991 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
992 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
993 if(!_simd_movemask_ps(depthMask
))
995 // stencil should already have been written in early/lateZ tests
996 RDTSC_STOP(BEOutputMerger
, 0, 0);
1001 // broadcast the results of the PS to all passing pixels
1002 OutputMerger(psContext
, pColorBase
, sample
, pBlendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, pPSState
->numRenderTargets
);
1004 if(!pPSState
->forceEarlyZ
&& !T::bForcedSampleCount
)
1006 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
1007 uint8_t * pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
1009 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
1010 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
1012 RDTSC_STOP(BEOutputMerger
, 0, 0);
1015 RDTSC_START(BEEndTile
);
1016 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1018 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1021 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1022 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1023 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1025 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1027 pColorBase
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1029 RDTSC_STOP(BEEndTile
, 0, 0);
1032 RDTSC_STOP(BEPixelRateBackend
, 0, 0);
1034 // optimized backend flow with NULL PS
1035 template<uint32_t sampleCountT
>
1036 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1038 RDTSC_START(BENullBackend
);
1039 ///@todo: handle center multisample pattern
1040 typedef SwrBackendTraits
<sampleCountT
, SWR_MSAA_STANDARD_PATTERN
> T
;
1041 RDTSC_START(BESetup
);
1043 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1044 const API_STATE
& state
= GetApiState(pDC
);
1045 const SWR_RASTSTATE
& rastState
= pDC
->pState
->state
.rastState
;
1047 // broadcast scalars
1048 BarycentricCoeffs coeffs
;
1049 coeffs
.vIa
= _simd_broadcast_ss(&work
.I
[0]);
1050 coeffs
.vIb
= _simd_broadcast_ss(&work
.I
[1]);
1051 coeffs
.vIc
= _simd_broadcast_ss(&work
.I
[2]);
1053 coeffs
.vJa
= _simd_broadcast_ss(&work
.J
[0]);
1054 coeffs
.vJb
= _simd_broadcast_ss(&work
.J
[1]);
1055 coeffs
.vJc
= _simd_broadcast_ss(&work
.J
[2]);
1057 coeffs
.vZa
= _simd_broadcast_ss(&work
.Z
[0]);
1058 coeffs
.vZb
= _simd_broadcast_ss(&work
.Z
[1]);
1059 coeffs
.vZc
= _simd_broadcast_ss(&work
.Z
[2]);
1061 coeffs
.vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
1063 uint8_t *pDepthBase
= renderBuffers
.pDepth
, *pStencilBase
= renderBuffers
.pStencil
;
1065 RDTSC_STOP(BESetup
, 0, 0);
1067 SWR_PS_CONTEXT psContext
;
1068 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
1071 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps((float)yy
));
1073 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
1076 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps((float)xx
));
1078 // iterate over active samples
1079 unsigned long sample
= 0;
1080 uint32_t sampleMask
= state
.blendState
.sampleMask
;
1081 while (_BitScanForward(&sample
, sampleMask
))
1083 sampleMask
&= ~(1 << sample
);
1084 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
1087 RDTSC_START(BEBarycentric
);
1088 // calculate per sample positions
1089 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, T::MultisampleT::vX(sample
));
1090 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, T::MultisampleT::vY(sample
));
1092 CalcSampleBarycentrics(coeffs
, psContext
);
1094 // interpolate and quantize z
1095 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1096 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
1098 RDTSC_STOP(BEBarycentric
, 0, 0);
1100 // interpolate user clip distance if available
1101 if (rastState
.clipDistanceMask
)
1103 coverageMask
&= ~ComputeUserClipMask(rastState
.clipDistanceMask
, work
.pUserClipBuffer
,
1104 psContext
.vI
.sample
, psContext
.vJ
.sample
);
1107 simdscalar vCoverageMask
= vMask(coverageMask
);
1108 simdscalar stencilPassMask
= vCoverageMask
;
1110 // offset depth/stencil buffers current sample
1111 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
1112 uint8_t *pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
1114 RDTSC_START(BEEarlyDepthTest
);
1115 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
,
1116 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1117 DepthStencilWrite(&state
.vp
[0], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1118 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1119 RDTSC_STOP(BEEarlyDepthTest
, 0, 0);
1121 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1122 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1123 UPDATE_STAT(DepthPassCount
, statCount
);
1125 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1127 pDepthBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1128 pStencilBase
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1131 RDTSC_STOP(BENullBackend
, 0, 0);
1134 void InitClearTilesTable()
1136 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
1138 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1139 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1140 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1141 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1142 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
1145 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
1146 PFN_BACKEND_FUNC gBackendSingleSample
[2] // input coverage
1150 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1151 [SWR_MSAA_SAMPLE_PATTERN_COUNT
]
1152 [2] // input coverage
1154 [2] // forcedSampleCount
1157 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1158 [2] // input coverage
1163 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1164 // arguments to static template arguments.
1165 template <uint32_t... ArgsT
>
1168 // Last Arg Terminator
1169 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1173 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1174 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1175 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1177 SWR_ASSERT(0 && "Invalid backend func\n");
1183 // Recursively parse args
1184 template <typename
... TArgsT
>
1185 static PFN_BACKEND_FUNC
GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg
, TArgsT
... remainingArgs
)
1189 case SWR_MSAA_CENTER_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_CENTER_PATTERN
>::GetFunc(remainingArgs
...); break;
1190 case SWR_MSAA_STANDARD_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...); break;
1192 SWR_ASSERT(0 && "Invalid sample pattern\n");
1193 return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...);
1198 // Recursively parse args
1199 template <typename
... TArgsT
>
1200 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1204 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1205 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1206 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1207 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1208 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1210 SWR_ASSERT(0 && "Invalid sample count\n");
1211 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1216 // Recursively parse args
1217 template <typename
... TArgsT
>
1218 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1222 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1225 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1229 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[2][2][2])
1231 for(uint32_t inputCoverage
= 0; inputCoverage
< 2; inputCoverage
++)
1233 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1235 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1237 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1238 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, (inputCoverage
> 0),
1239 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
1245 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_MSAA_SAMPLE_PATTERN_COUNT
][2][2][2][2])
1247 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1249 for(uint32_t samplePattern
= SWR_MSAA_CENTER_PATTERN
; samplePattern
< SWR_MSAA_SAMPLE_PATTERN_COUNT
; samplePattern
++)
1251 for(uint32_t inputCoverage
= 0; inputCoverage
< 2; inputCoverage
++)
1253 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1255 for(uint32_t forcedSampleCount
= 0; forcedSampleCount
< 2; forcedSampleCount
++)
1257 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1259 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][forcedSampleCount
][canEarlyZ
] =
1260 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, (SWR_MSAA_SAMPLE_PATTERN
)samplePattern
, (inputCoverage
> 0),
1261 (isCentroid
> 0), (forcedSampleCount
> 0), (canEarlyZ
> 0), SWR_BACKEND_MSAA_PIXEL_RATE
);
1270 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][2][2][2])
1272 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1274 for(uint32_t inputCoverage
= 0; inputCoverage
< 2; inputCoverage
++)
1276 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1278 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1280 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1281 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, (inputCoverage
> 0),
1282 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1289 void InitBackendFuncTables()
1291 InitBackendSingleFuncTable(gBackendSingleSample
);
1292 InitBackendPixelFuncTable(gBackendPixelRateTable
);
1293 InitBackendSampleFuncTable(gBackendSampleRateTable
);
1295 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1296 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1297 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1298 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1299 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;