1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "backend_impl.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 #include "backends/gen_BackendPixelRate.hpp"
42 //////////////////////////////////////////////////////////////////////////
43 /// @brief Process compute work for one thread group of a dispatch.
///        Lazily allocates the spill/fill and scratch buffers from the draw
///        context's arena, fills in an SWR_CS_CONTEXT and calls state.pfnCsFunc.
44 /// @param pDC - pointer to draw context (dispatch).
45 /// @param workerId - The unique worker ID that is assigned to this thread.
///        Used to index pContext->ppScratch and threadPool.pThreadData.
46 /// @param threadGroupId - the linear index for the thread group within the dispatch.
/// @param pSpillFillBuffer - in/out pointer to spill/fill memory; allocated here
///        when totalSpillFillSize is non-zero and the pointer is still null.
/// @param pScratchSpace - in/out pointer to scratch memory; allocated here when the
///        computed scratch size is non-zero and the pointer is still null.
/// NOTE(review): the declarations of workerId and pScratchSpace are not visible in
/// this listing; pScratchSpace is presumably a void*& like pSpillFillBuffer - confirm.
47 void ProcessComputeBE(DRAW_CONTEXT
* pDC
,
49 uint32_t threadGroupId
,
50 void*& pSpillFillBuffer
,
53 SWR_CONTEXT
* pContext
= pDC
->pContext
;
55 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEDispatch
, pDC
->drawId
);
57 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
58 SWR_ASSERT(pTaskData
!= nullptr);
60 // Ensure spill fill memory has been allocated (only when a size is requested
// and the caller has not already provided a buffer).
61 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
62 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
64 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD16_BYTES
);
// Ensure scratch space has been allocated: total size is the per-warp size
// multiplied by the number of warps.
67 size_t scratchSpaceSize
=
68 pDC
->pState
->state
.scratchSpaceSizePerWarp
* pDC
->pState
->state
.scratchSpaceNumWarps
;
69 if (scratchSpaceSize
&& pScratchSpace
== nullptr)
71 pScratchSpace
= pDC
->pArena
->AllocAlignedSync(scratchSpaceSize
, KNOB_SIMD16_BYTES
);
74 const API_STATE
& state
= GetApiState(pDC
);
// Build the zero-initialized CS context: thread group index, dispatch
// dimensions from the task data, and the per-worker / per-dispatch buffers.
76 SWR_CS_CONTEXT csContext
{0};
77 csContext
.tileCounter
= threadGroupId
;
78 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
79 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
80 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
81 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
82 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
83 csContext
.pScratchSpace
= (uint8_t*)pScratchSpace
;
84 csContext
.scratchSpacePerWarp
= pDC
->pState
->state
.scratchSpaceSizePerWarp
;
// Invoke the CS function with the private state and this worker's private data.
// NOTE(review): the final argument is not visible in this listing; presumably
// the csContext built above - confirm.
86 state
.pfnCsFunc(GetPrivateState(pDC
),
87 pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
,
// Account for the invocations of this group and report the context's stats.
90 UPDATE_STAT_BE(CsInvocations
, state
.totalThreadsInGroup
);
91 AR_EVENT(CSStats((HANDLE
)&csContext
.stats
));
93 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEDispatch
, 1);
96 //////////////////////////////////////////////////////////////////////////
97 /// @brief Process shutdown.
98 /// @param pDC - pointer to draw context (dispatch).
99 /// @param workerId - The unique worker ID that is assigned to this thread.
100 /// @param macroTile - macrotile id supplied by the caller (this function has no
///        threadGroupId parameter).
/// @param pUserData - opaque user data pointer supplied by the caller.
/// NOTE(review): the function body is not visible in this listing, so how these
/// parameters are used (if at all) cannot be confirmed here.
101 void ProcessShutdownBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pUserData
)
//////////////////////////////////////////////////////////////////////////
/// @brief Process a sync work item.
///        The visible body only decodes macroTile into tile indices and asserts
///        that both indices are zero.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID of the calling thread.
/// @param macroTile - macrotile id; expected to decode to tile (0, 0).
/// @param pUserData - opaque user data pointer supplied by the caller.
/// NOTE(review): the declarations of x and y are not visible in this listing.
106 void ProcessSyncBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pUserData
)
109 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
110 SWR_ASSERT(x
== 0 && y
== 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Store the hot tile of a single attachment for one macrotile.
///        Selects the hot tile source format from the attachment, performs a
///        pending clear if the tile is in HOTTILE_CLEAR state, calls
///        pContext->pfnStoreTile when the tile is dirty (or the requested
///        post-store state is dirty), and finally updates the hot tile state.
/// @param pDC - pointer to draw context.
/// @param pDesc - store tiles descriptor; postStoreTileState is read here.
/// @param attachment - render target attachment whose hot tile is stored.
/// NOTE(review): workerId and macroTile are used below but their parameter
/// declarations are not visible in this listing.
113 void ProcessStoreTileBE(DRAW_CONTEXT
* pDC
,
116 STORE_TILES_DESC
* pDesc
,
117 SWR_RENDERTARGET_ATTACHMENT attachment
)
119 SWR_CONTEXT
* pContext
= pDC
->pContext
;
120 HANDLE hWorkerPrivateData
= pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
122 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEStoreTiles
, pDC
->drawId
);
// Pick the hot tile format matching the attachment type: all eight color
// attachments share the color format; depth and stencil have their own.
124 SWR_FORMAT srcFormat
;
127 case SWR_ATTACHMENT_COLOR0
:
128 case SWR_ATTACHMENT_COLOR1
:
129 case SWR_ATTACHMENT_COLOR2
:
130 case SWR_ATTACHMENT_COLOR3
:
131 case SWR_ATTACHMENT_COLOR4
:
132 case SWR_ATTACHMENT_COLOR5
:
133 case SWR_ATTACHMENT_COLOR6
:
134 case SWR_ATTACHMENT_COLOR7
:
135 srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
;
137 case SWR_ATTACHMENT_DEPTH
:
138 srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
;
140 case SWR_ATTACHMENT_STENCIL
:
141 srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
;
// Unknown attachment: report it and fall back to the color hot tile format.
144 SWR_INVALID("Unknown attachment: %d", attachment
);
145 srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
;
150 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
152 // Only need to store the hottile if it's been rendered to...
// NOTE(review): the declaration receiving this result (pHotTile) is not
// visible in this listing.
154 pContext
->pHotTileMgr
->GetHotTileNoLoad(pContext
, pDC
, macroTile
, attachment
, false);
157 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
158 if (pHotTile
->state
== HOTTILE_CLEAR
)
// Clear function is looked up per source format.
160 PFN_CLEAR_TILES pfnClearTiles
= gClearTilesTable
[srcFormat
];
161 SWR_ASSERT(pfnClearTiles
!= nullptr);
167 pHotTile
->renderTargetArrayIndex
,
// Store when the tile is dirty, or when the requested post-store state is dirty.
172 if (pHotTile
->state
== HOTTILE_DIRTY
||
173 pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
// Destination origin of this macrotile, in units of the macrotile dimensions.
175 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
176 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
178 pContext
->pfnStoreTile(GetPrivateState(pDC
),
184 pHotTile
->renderTargetArrayIndex
,
// Move a dirty or resolved tile to the requested post-store state, except
// that a resolved tile is not moved back to dirty.
188 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
190 if (!(pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
&&
191 pHotTile
->state
== HOTTILE_RESOLVED
))
193 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
197 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEStoreTiles
, 1);
//////////////////////////////////////////////////////////////////////////
/// @brief Store the hot tiles of every attachment selected by the descriptor's
///        attachment mask, by calling ProcessStoreTileBE once per set bit.
/// @param pDC - pointer to draw context.
/// @param workerId - worker ID of the calling thread.
/// @param macroTile - macrotile id whose hot tiles are stored.
/// @param pData - pointer to a STORE_TILES_DESC.
200 void ProcessStoreTilesBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pData
)
202 STORE_TILES_DESC
* pDesc
= (STORE_TILES_DESC
*)pData
;
// Walk the set bits of the attachment mask; rt receives the bit index.
// NOTE(review): the statement that clears the processed bit from mask is not
// visible in this listing - confirm it exists, otherwise the loop cannot end.
204 unsigned long rt
= 0;
205 uint32_t mask
= pDesc
->attachmentMask
;
206 while (_BitScanForward(&rt
, mask
))
209 ProcessStoreTileBE(pDC
, workerId
, macroTile
, pDesc
, (SWR_RENDERTARGET_ATTACHMENT
)rt
);
//////////////////////////////////////////////////////////////////////////
/// @brief Set the hot tile state of every attachment selected by the
///        descriptor's attachment mask to the descriptor's newTileState.
/// @param pDC - pointer to draw context.
/// NOTE(review): the remaining parameters (including pData, which is cast to
/// DISCARD_INVALIDATE_TILES_DESC*) are not visible in this listing.
213 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
* pDC
,
218 DISCARD_INVALIDATE_TILES_DESC
* pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
219 SWR_CONTEXT
* pContext
= pDC
->pContext
;
221 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
// Visit every attachment index and act only on those present in the mask.
223 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
225 if (pDesc
->attachmentMask
& (1 << i
))
// Look up the hot tile without loading it; createNewTiles is forwarded
// from the descriptor.
// NOTE(review): the declaration of pHotTile and any null check before the
// state write are not visible in this listing.
228 pContext
->pHotTileMgr
->GetHotTileNoLoad(pContext
,
231 (SWR_RENDERTARGET_ATTACHMENT
)i
,
232 pDesc
->createNewTiles
,
236 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
//////////////////////////////////////////////////////////////////////////
/// @brief Backend that performs depth/stencil processing without setting up a
///        pixel shader context (SetupPixelShaderContext is skipped and the
///        color buffer argument to SetupRenderBuffers is NULL).
///        For every SIMD tile of the raster tile and every active sample it
///        computes coverage, optionally applies the depth bounds test and user
///        clip distances, interpolates and quantizes z, runs DepthStencilTest /
///        DepthStencilWrite and updates the DepthPassCount stat.
/// @tparam sampleCountT - multisample count the backend is instantiated for.
/// @param pDC - pointer to draw context.
/// @param work - triangle descriptor; its per-sample coverageMask is consumed
///        (shifted) as SIMD tiles are processed.
/// @param renderBuffers - render output buffers used to derive the depth and
///        stencil pointers.
/// NOTE(review): workerId, x and y are used below but their parameter
/// declarations are not visible in this listing.
242 template <uint32_t sampleCountT
>
243 void BackendNullPS(DRAW_CONTEXT
* pDC
,
247 SWR_TRIANGLE_DESC
& work
,
248 RenderOutputBuffers
& renderBuffers
)
250 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BENullBackend
, pDC
->drawId
);
251 ///@todo: handle center multisample pattern
252 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BESetup
, pDC
->drawId
);
254 const API_STATE
& state
= GetApiState(pDC
);
256 BarycentricCoeffs coeffs
;
257 SetupBarycentricCoeffs(&coeffs
, work
);
// Only depth and stencil pointers are requested; no color buffers.
259 uint8_t *pDepthBuffer
, *pStencilBuffer
;
260 SetupRenderBuffers(NULL
, &pDepthBuffer
, &pStencilBuffer
, 0, renderBuffers
);
262 SWR_PS_CONTEXT psContext
;
263 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
265 RDTSC_END(pDC
->pContext
->pBucketMgr
, BESetup
, 0);
// Sample positions start at the tile origin (x, y) plus the upper-left
// offsets and advance by one SIMD tile per loop iteration (dx / dy).
267 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
269 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
270 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
271 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
273 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
275 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
277 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
279 // iterate over active samples
280 unsigned long sample
= 0;
281 uint32_t sampleMask
= state
.blendState
.sampleMask
;
282 while (_BitScanForward(&sample
, sampleMask
))
// Clear the bit of the sample being processed so the scan advances.
284 sampleMask
&= ~(1 << sample
);
// Coverage of the current SIMD tile for this sample.
286 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
290 // offset depth/stencil buffers current sample
291 uint8_t* pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
292 uint8_t* pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Depth bounds test: drop lanes whose stored depth is rejected by
// CalcDepthBoundsAcceptMask for the [minz, maxz] range.
294 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
296 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
,
297 "Unsupported depth hot tile format");
// NOTE(review): the declaration receiving this load (z) is not visible
// in this listing.
300 _simd_load_ps(reinterpret_cast<const float*>(pDepthSample
));
302 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
303 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
305 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
308 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEBarycentric
, pDC
->drawId
);
310 // calculate per sample positions
311 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, samplePos
.vX(sample
));
312 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, samplePos
.vY(sample
));
314 CalcSampleBarycentrics(coeffs
, psContext
);
316 // interpolate and quantize z
317 psContext
.vZ
= vplaneps(coeffs
.vZa
,
321 psContext
.vJ
.sample
);
322 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
324 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEBarycentric
, 0);
326 // interpolate user clip distance if available
327 if (state
.backendState
.clipDistanceMask
)
// Lanes flagged by ComputeUserClipMask are removed from coverage.
329 coverageMask
&= ~ComputeUserClipMask(state
.backendState
.clipDistanceMask
,
330 work
.pUserClipBuffer
,
332 psContext
.vJ
.sample
);
// Both masks start out as the coverage mask expanded to a vector mask.
335 simdscalar vCoverageMask
= _simd_vmask_ps(coverageMask
);
336 simdscalar stencilPassMask
= vCoverageMask
;
338 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEEarlyDepthTest
, pDC
->drawId
);
339 simdscalar depthPassMask
= DepthStencilTest(&state
,
340 work
.triFlags
.frontFacing
,
341 work
.triFlags
.viewportIndex
,
347 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask
),
348 _simd_movemask_ps(stencilPassMask
),
349 _simd_movemask_ps(vCoverageMask
)));
350 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
],
351 &state
.depthStencilState
,
352 work
.triFlags
.frontFacing
,
359 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEEarlyDepthTest
, 0);
// Count the lanes that passed the depth test and add them to the stat.
361 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
362 uint32_t statCount
= _mm_popcnt_u32(statMask
);
363 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Consume this SIMD tile's bits so the next tile's coverage is in the low bits.
368 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance the depth pointer by one SIMD tile's worth of bytes.
// NOTE(review): the left-hand side of the second expression is not visible in
// this listing; presumably the matching pStencilBuffer advance - confirm.
371 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
373 (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
375 vXSamplePosUL
= _simd_add_ps(vXSamplePosUL
, dx
);
378 vYSamplePosUL
= _simd_add_ps(vYSamplePosUL
, dy
);
381 RDTSC_END(pDC
->pContext
->pBucketMgr
, BENullBackend
, 0);
// Global backend function tables.
// NOTE(review): several of these array declarations are truncated in this
// listing (trailing dimensions and terminators are not visible).

// Clear-tile functions indexed by SWR format; value-initialized here.
384 PFN_CLEAR_TILES gClearTilesTable
[NUM_SWR_FORMATS
] = {};
// Null-PS backends indexed by multisample type.
385 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
// Single-sample backends; dimensions annotated inline.
386 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
][2] // centroid
// Pixel-rate backends; dimensions annotated inline.
389 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
][2] // isCenterPattern
390 [SWR_INPUT_COVERAGE_COUNT
][2] // centroid
391 [2] // forcedSampleCount
// Sample-rate backends indexed by multisample type and input coverage.
394 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
]
//////////////////////////////////////////////////////////////////////////
/// @brief Populate the global backend function tables.
///        Calls the pixel-rate, single-sample and sample-rate table
///        initializers, then fills gBackendNullPs with one BackendNullPS
///        instantiation per multisample type (1X through 16X).
399 void InitBackendFuncTables()
401 InitBackendPixelRate();
402 InitBackendSingleFuncTable(gBackendSingleSample
);
403 InitBackendSampleFuncTable(gBackendSampleRateTable
);
// One null-PS backend per supported sample count.
405 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
<SWR_MULTISAMPLE_1X
>;
406 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
<SWR_MULTISAMPLE_2X
>;
407 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
<SWR_MULTISAMPLE_4X
>;
408 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
<SWR_MULTISAMPLE_8X
>;
409 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
<SWR_MULTISAMPLE_16X
>;