1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "backend_impl.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 #include "backends/gen_BackendPixelRate.hpp"
42 //////////////////////////////////////////////////////////////////////////
43 /// @brief Process compute work.
44 /// @param pDC - pointer to draw context (dispatch).
45 /// @param workerId - The unique worker ID that is assigned to this thread.
46 /// @param threadGroupId - the linear index for the thread group within the dispatch.
/// @param pSpillFillBuffer - reference to the spill/fill buffer pointer. When it is
///        null and totalSpillFillSize is non-zero, it is allocated here from the
///        draw context's arena and the new pointer is written back to the caller.
/// @param pScratchSpace - scratch space pointer, allocated here on the same
///        "null and non-zero size" condition as the spill/fill buffer.
///        NOTE(review): assumed to be a by-reference parameter like
///        pSpillFillBuffer - confirm against the full declaration.
47 void ProcessComputeBE(DRAW_CONTEXT
* pDC
,
49 uint32_t threadGroupId
,
50 void*& pSpillFillBuffer
,
// Cache the owning context; it is used below for the per-worker scratch
// pointer and the per-worker private data.
53 SWR_CONTEXT
* pContext
= pDC
->pContext
;
// Paired with the RDTSC_END(BEDispatch) at the end of this function.
55 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEDispatch
, pDC
->drawId
);
// The dispatch's task data is interpreted as a COMPUTE_DESC and must exist.
57 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
58 SWR_ASSERT(pTaskData
!= nullptr);
60 // Ensure spill fill memory has been allocated.
61 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
// Allocate only when a size is requested and the caller has not supplied a buffer yet.
62 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
64 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD16_BYTES
);
// Total scratch space = per-warp size * number of warps; allocated under the
// same "non-zero size and still null" condition as the spill/fill buffer.
67 size_t scratchSpaceSize
=
68 pDC
->pState
->state
.scratchSpaceSizePerWarp
* pDC
->pState
->state
.scratchSpaceNumWarps
;
69 if (scratchSpaceSize
&& pScratchSpace
== nullptr)
71 pScratchSpace
= pDC
->pArena
->AllocAlignedSync(scratchSpaceSize
, KNOB_SIMD16_BYTES
);
74 const API_STATE
& state
= GetApiState(pDC
);
// Build the compute shader context: thread group index, dispatch dimensions
// from the task data, per-worker scratch pointer (pTGSM) and the two buffers
// prepared above.
76 SWR_CS_CONTEXT csContext
{0};
77 csContext
.tileCounter
= threadGroupId
;
78 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
79 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
80 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
81 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
82 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
83 csContext
.pScratchSpace
= (uint8_t*)pScratchSpace
;
84 csContext
.scratchSpacePerWarp
= pDC
->pState
->state
.scratchSpaceSizePerWarp
;
// Invoke the compute shader function stored in the API state, passing the
// private state and this worker's private data.
86 state
.pfnCsFunc(GetPrivateState(pDC
),
87 pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
,
// Account totalThreadsInGroup against the CsInvocations backend stat and
// report the shader-written stats block as an event.
90 UPDATE_STAT_BE(CsInvocations
, state
.totalThreadsInGroup
);
91 AR_EVENT(CSStats((HANDLE
)&csContext
.stats
));
93 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEDispatch
, 1);
96 //////////////////////////////////////////////////////////////////////////
97 /// @brief Process shutdown.
98 /// @param pDC - pointer to draw context (dispatch).
99 /// @param workerId - The unique worker ID that is assigned to this thread.
100 /// @param macroTile - macrotile id supplied with the work item.
/// @param pUserData - opaque pointer supplied with the work item.
101 void ProcessShutdownBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pUserData
)
//////////////////////////////////////////////////////////////////////////
/// @brief Process a sync work item. Decodes the macrotile id into tile
///        indices and asserts that both indices are zero.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - macrotile id; expected to decode to tile (0, 0).
/// @param pUserData - opaque pointer supplied with the work item.
106 void ProcessSyncBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pUserData
)
// Split the macrotile id into its x/y tile indices.
109 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
// Sync work is only expected on tile (0, 0).
110 SWR_ASSERT(x
== 0 && y
== 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Store a single attachment's hot tile for one macrotile. A tile with
///        a pending clear is cleared first, the tile is stored when it is
///        dirty (or when the requested post-store state is dirty), and the
///        hot tile state is then moved to the requested post-store state.
/// @param pDC - pointer to draw context.
/// @param pDesc - store-tiles descriptor; postStoreTileState is read here.
/// @param attachment - which render target attachment to store.
113 void ProcessStoreTileBE(DRAW_CONTEXT
* pDC
,
116 STORE_TILES_DESC
* pDesc
,
117 SWR_RENDERTARGET_ATTACHMENT attachment
)
119 SWR_CONTEXT
* pContext
= pDC
->pContext
;
// Per-worker private data for the calling worker thread.
120 HANDLE hWorkerPrivateData
= pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
// Paired with the RDTSC_END(BEStoreTiles) at the end of this function.
122 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEStoreTiles
, pDC
->drawId
);
// Select the hot tile source format from the attachment kind: all eight
// color attachments share the color hot tile format, depth and stencil use
// their own. Anything else is reported through SWR_INVALID and falls back to
// the color format.
124 SWR_FORMAT srcFormat
;
127 case SWR_ATTACHMENT_COLOR0
:
128 case SWR_ATTACHMENT_COLOR1
:
129 case SWR_ATTACHMENT_COLOR2
:
130 case SWR_ATTACHMENT_COLOR3
:
131 case SWR_ATTACHMENT_COLOR4
:
132 case SWR_ATTACHMENT_COLOR5
:
133 case SWR_ATTACHMENT_COLOR6
:
134 case SWR_ATTACHMENT_COLOR7
:
135 srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
;
137 case SWR_ATTACHMENT_DEPTH
:
138 srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
;
140 case SWR_ATTACHMENT_STENCIL
:
141 srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
;
144 SWR_INVALID("Unknown attachment: %d", attachment
);
145 srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
;
// Split the macrotile id into its x/y tile indices (used for the destination
// offsets below).
150 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
152 // Only need to store the hottile if it's been rendered to...
154 pContext
->pHotTileMgr
->GetHotTileNoLoad(pContext
, pDC
, macroTile
, attachment
, false);
157 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
158 if (pHotTile
->state
== HOTTILE_CLEAR
)
// The clear routine is looked up by source format and must be registered.
160 PFN_CLEAR_TILES pfnClearTiles
= gClearTilesTable
[srcFormat
];
161 SWR_ASSERT(pfnClearTiles
!= nullptr);
167 pHotTile
->renderTargetArrayIndex
,
// Store when the tile holds rendered data, or when the caller asks for the
// tile to be dirty after the store.
172 if (pHotTile
->state
== HOTTILE_DIRTY
||
173 pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
// Destination origin = macrotile index scaled by the macrotile dimensions.
175 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
176 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
178 pContext
->pfnStoreTile(GetPrivateState(pDC
),
184 pHotTile
->renderTargetArrayIndex
,
// Only dirty or resolved tiles change state after the store. A resolved tile
// is left as-is when the requested post-store state is dirty.
188 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
190 if (!(pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
&&
191 pHotTile
->state
== HOTTILE_RESOLVED
))
193 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
197 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEStoreTiles
, 1);
//////////////////////////////////////////////////////////////////////////
/// @brief Store every attachment selected in the descriptor's attachment mask
///        for one macrotile by calling ProcessStoreTileBE once per set bit.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - macrotile id to store.
/// @param pData - work item payload; interpreted as a STORE_TILES_DESC.
200 void ProcessStoreTilesBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pData
)
202 STORE_TILES_DESC
* pDesc
= (STORE_TILES_DESC
*)pData
;
// Iterate over the set bits of the attachment mask; each bit index found by
// _BitScanForward is used as an SWR_RENDERTARGET_ATTACHMENT value.
204 unsigned long rt
= 0;
205 uint32_t mask
= pDesc
->attachmentMask
;
206 while (_BitScanForward(&rt
, mask
))
209 ProcessStoreTileBE(pDC
, workerId
, macroTile
, pDesc
, (SWR_RENDERTARGET_ATTACHMENT
)rt
);
//////////////////////////////////////////////////////////////////////////
/// @brief For every attachment selected in the descriptor's attachment mask,
///        look up the hot tile (without loading it) and set its state to the
///        descriptor's newTileState.
/// @param pDC - pointer to draw context.
213 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
* pDC
,
// The work item payload is interpreted as a DISCARD_INVALIDATE_TILES_DESC.
218 DISCARD_INVALIDATE_TILES_DESC
* pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
219 SWR_CONTEXT
* pContext
= pDC
->pContext
;
// Sample count derived from the rasterizer state's sampleCount setting.
221 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
// Visit each attachment slot and act only on those selected in the mask.
223 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
225 if (pDesc
->attachmentMask
& (1 << i
))
// pDesc->createNewTiles is forwarded to the hot tile lookup.
228 pContext
->pHotTileMgr
->GetHotTileNoLoad(pContext
,
231 (SWR_RENDERTARGET_ATTACHMENT
)i
,
232 pDesc
->createNewTiles
,
236 HOTTILE_STATE newState
= (HOTTILE_STATE
)pDesc
->newTileState
;;
// Invalidating a dirty or clear-pending tile drops its contents; that is
// deliberately tolerated (see the comment below), so this branch does nothing.
237 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_CLEAR
)
239 if (newState
== HOTTILE_INVALID
)
241 // This is OK for APIs that explicitly allow discards
242 // (for e.g. depth / stencil data)
243 //SWR_INVALID("Discarding valid data!");
// Apply the requested state.
246 pHotTile
->state
= newState
;
//////////////////////////////////////////////////////////////////////////
/// @brief Backend for the null pixel shader case (no pixel shader context is
///        set up here). For each SIMD tile of the raster tile and each active
///        sample it interpolates and quantizes depth, applies the optional
///        depth bounds and user clip tests, runs the depth/stencil test and
///        write, and accumulates the depth pass count.
/// @tparam sampleCountT - multisample count selector; instantiated for
///         SWR_MULTISAMPLE_1X through SWR_MULTISAMPLE_16X in
///         InitBackendFuncTables.
/// @param pDC - pointer to draw context.
/// @param work - triangle descriptor; its per-sample coverage masks are read
///        and shifted here, and its triFlags / user clip buffer are read.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the depth
///        and stencil buffer pointers.
252 template <uint32_t sampleCountT
>
253 void BackendNullPS(DRAW_CONTEXT
* pDC
,
257 SWR_TRIANGLE_DESC
& work
,
258 RenderOutputBuffers
& renderBuffers
)
// Outer timing bucket for the whole function; closed by the final RDTSC_END.
260 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BENullBackend
, pDC
->drawId
);
261 ///@todo: handle center multisample pattern
262 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BESetup
, pDC
->drawId
);
264 const API_STATE
& state
= GetApiState(pDC
);
// Per-triangle setup: barycentric coefficients from the triangle descriptor.
266 BarycentricCoeffs coeffs
;
267 SetupBarycentricCoeffs(&coeffs
, work
);
// Only depth and stencil pointers are requested; NULL is passed for the
// first (color) argument.
269 uint8_t *pDepthBuffer
, *pStencilBuffer
;
270 SetupRenderBuffers(NULL
, &pDepthBuffer
, &pStencilBuffer
, 0, renderBuffers
);
272 SWR_PS_CONTEXT psContext
;
273 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
275 RDTSC_END(pDC
->pContext
->pBucketMgr
, BESetup
, 0);
// Upper-left sample Y positions for the first SIMD tile row; advanced by dy
// after each row.
277 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
279 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
280 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
// Walk the raster tile in SIMD-tile sized steps, rows first.
281 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Upper-left sample X positions restart at the tile's x for every row and are
// advanced by dx after each SIMD tile.
283 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
285 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
287 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
289 // iterate over active samples
290 unsigned long sample
= 0;
291 uint32_t sampleMask
= state
.blendState
.sampleMask
;
292 while (_BitScanForward(&sample
, sampleMask
))
// Clear this sample's bit so the scan moves on to the next active sample.
294 sampleMask
&= ~(1 << sample
);
// Coverage for this sample limited by MASK.
// NOTE(review): MASK is defined outside this listing; presumably it selects
// the bits belonging to the current SIMD tile - confirm.
296 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
300 // offset depth/stencil buffers current sample
301 uint8_t* pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
302 uint8_t* pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Optional depth bounds test: depth values are loaded from the depth hot
// tile as floats (hence the R32_FLOAT static_assert) and lanes not accepted
// by CalcDepthBoundsAcceptMask for [minz, maxz] are removed from coverage.
304 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
306 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
,
307 "Unsupported depth hot tile format");
310 _simd_load_ps(reinterpret_cast<const float*>(pDepthSample
));
312 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
313 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
315 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
318 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEBarycentric
, pDC
->drawId
);
320 // calculate per sample positions
321 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, samplePos
.vX(sample
));
322 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, samplePos
.vY(sample
));
324 CalcSampleBarycentrics(coeffs
, psContext
);
326 // interpolate and quantize z
327 psContext
.vZ
= vplaneps(coeffs
.vZa
,
331 psContext
.vJ
.sample
);
332 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
334 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEBarycentric
, 0);
336 // interpolate user clip distance if available
337 if (state
.backendState
.clipDistanceMask
)
// Lanes flagged by ComputeUserClipMask are removed from coverage.
339 coverageMask
&= ~ComputeUserClipMask(state
.backendState
.clipDistanceMask
,
340 work
.pUserClipBuffer
,
342 psContext
.vJ
.sample
);
// Expand the coverage bitmask to a per-lane vector mask; the stencil pass
// mask starts out equal to coverage.
345 simdscalar vCoverageMask
= _simd_vmask_ps(coverageMask
);
346 simdscalar stencilPassMask
= vCoverageMask
;
348 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEEarlyDepthTest
, pDC
->drawId
);
// Depth/stencil test for this sample, followed by an event carrying the
// resulting masks and the depth/stencil write.
349 simdscalar depthPassMask
= DepthStencilTest(&state
,
350 work
.triFlags
.frontFacing
,
351 work
.triFlags
.viewportIndex
,
357 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask
),
358 _simd_movemask_ps(stencilPassMask
),
359 _simd_movemask_ps(vCoverageMask
)));
360 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
],
361 &state
.depthStencilState
,
362 work
.triFlags
.frontFacing
,
369 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEEarlyDepthTest
, 0);
// Count the lanes that passed the depth test and add them to DepthPassCount.
371 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
372 uint32_t statCount
= _mm_popcnt_u32(statMask
);
373 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Shift out the coverage bits consumed by this SIMD tile.
378 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance the depth pointer by one SIMD's worth of depth-format bytes; the
// expression that follows uses the stencil format's size in the same way.
381 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
383 (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
// Step the sample positions to the next SIMD tile in x, then (after the
// inner loop) to the next row in y.
385 vXSamplePosUL
= _simd_add_ps(vXSamplePosUL
, dx
);
388 vYSamplePosUL
= _simd_add_ps(vYSamplePosUL
, dy
);
391 RDTSC_END(pDC
->pContext
->pBucketMgr
, BENullBackend
, 0);
// Clear-tile functions indexed by SWR_FORMAT; ProcessStoreTileBE looks up the
// entry for the hot tile's source format and asserts that it is non-null.
394 PFN_CLEAR_TILES gClearTilesTable
[NUM_SWR_FORMATS
] = {};
// Null pixel shader backends indexed by multisample type; filled in by
// InitBackendFuncTables.
395 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
// Single-sample backends; passed to InitBackendSingleFuncTable.
396 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
][2] // centroid
// Pixel-rate backends; each dimension is named by its trailing comment.
399 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
][2] // isCenterPattern
400 [SWR_INPUT_COVERAGE_COUNT
][2] // centroid
401 [2] // forcedSampleCount
// Sample-rate backends; passed to InitBackendSampleFuncTable.
404 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
]
//////////////////////////////////////////////////////////////////////////
/// @brief Populate the backend function tables: calls the pixel-rate,
///        single-sample and sample-rate table initializers, then registers
///        one BackendNullPS instantiation per multisample type.
409 void InitBackendFuncTables()
411 InitBackendPixelRate();
412 InitBackendSingleFuncTable(gBackendSingleSample
);
413 InitBackendSampleFuncTable(gBackendSampleRateTable
);
// One null-PS backend per supported sample count (1x, 2x, 4x, 8x, 16x).
415 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
<SWR_MULTISAMPLE_1X
>;
416 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
<SWR_MULTISAMPLE_2X
>;
417 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
<SWR_MULTISAMPLE_4X
>;
418 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
<SWR_MULTISAMPLE_8X
>;
419 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
<SWR_MULTISAMPLE_16X
>;