/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Backend handles rasterization, pixel shading and output merger
*
******************************************************************************/
30 #include <smmintrin.h>
33 #include "backend_impl.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 #include "backends/gen_BackendPixelRate.hpp"
42 //////////////////////////////////////////////////////////////////////////
43 /// @brief Process compute work.
44 /// @param pDC - pointer to draw context (dispatch).
45 /// @param workerId - The unique worker ID that is assigned to this thread.
46 /// @param threadGroupId - the linear index for the thread group within the dispatch.
47 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
, void*& pScratchSpace
)
49 SWR_CONTEXT
*pContext
= pDC
->pContext
;
51 AR_BEGIN(BEDispatch
, pDC
->drawId
);
53 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
54 SWR_ASSERT(pTaskData
!= nullptr);
56 // Ensure spill fill memory has been allocated.
57 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
58 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
60 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
63 size_t scratchSpaceSize
= pDC
->pState
->state
.scratchSpaceSize
* pDC
->pState
->state
.scratchSpaceNumInstances
;
64 if (scratchSpaceSize
&& pScratchSpace
== nullptr)
66 pScratchSpace
= pDC
->pArena
->AllocAlignedSync(scratchSpaceSize
, KNOB_SIMD_BYTES
);
69 const API_STATE
& state
= GetApiState(pDC
);
71 SWR_CS_CONTEXT csContext
{ 0 };
72 csContext
.tileCounter
= threadGroupId
;
73 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
74 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
75 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
76 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
77 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
78 csContext
.pScratchSpace
= (uint8_t*)pScratchSpace
;
79 csContext
.scratchSpacePerSimd
= pDC
->pState
->state
.scratchSpaceSize
;
81 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
83 UPDATE_STAT_BE(CsInvocations
, state
.totalThreadsInGroup
);
85 AR_END(BEDispatch
, 1);
88 //////////////////////////////////////////////////////////////////////////
89 /// @brief Process shutdown.
90 /// @param pDC - pointer to draw context (dispatch).
91 /// @param workerId - The unique worker ID that is assigned to this thread.
92 /// @param threadGroupId - the linear index for the thread group within the dispatch.
93 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
98 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
101 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
102 SWR_ASSERT(x
== 0 && y
== 0);
105 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, STORE_TILES_DESC
* pDesc
,
106 SWR_RENDERTARGET_ATTACHMENT attachment
)
108 SWR_CONTEXT
*pContext
= pDC
->pContext
;
110 AR_BEGIN(BEStoreTiles
, pDC
->drawId
);
112 SWR_FORMAT srcFormat
;
115 case SWR_ATTACHMENT_COLOR0
:
116 case SWR_ATTACHMENT_COLOR1
:
117 case SWR_ATTACHMENT_COLOR2
:
118 case SWR_ATTACHMENT_COLOR3
:
119 case SWR_ATTACHMENT_COLOR4
:
120 case SWR_ATTACHMENT_COLOR5
:
121 case SWR_ATTACHMENT_COLOR6
:
122 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
123 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
124 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
125 default: SWR_INVALID("Unknown attachment: %d", attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
129 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
131 // Only need to store the hottile if it's been rendered to...
132 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(pContext
, pDC
, macroTile
, attachment
, false);
135 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
136 if (pHotTile
->state
== HOTTILE_CLEAR
)
138 PFN_CLEAR_TILES pfnClearTiles
= gClearTilesTable
[srcFormat
];
139 SWR_ASSERT(pfnClearTiles
!= nullptr);
141 pfnClearTiles(pDC
, attachment
, macroTile
, pHotTile
->renderTargetArrayIndex
, pHotTile
->clearData
, pDesc
->rect
);
144 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
146 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
147 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
149 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
150 attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
154 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
156 if (!(pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
&& pHotTile
->state
== HOTTILE_RESOLVED
))
158 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
162 AR_END(BEStoreTiles
, 1);
165 void ProcessStoreTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
167 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
169 unsigned long rt
= 0;
170 uint32_t mask
= pDesc
->attachmentMask
;
171 while (_BitScanForward(&rt
, mask
))
174 ProcessStoreTileBE(pDC
, workerId
, macroTile
, pDesc
, (SWR_RENDERTARGET_ATTACHMENT
)rt
);
178 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
180 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
181 SWR_CONTEXT
*pContext
= pDC
->pContext
;
183 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
185 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
187 if (pDesc
->attachmentMask
& (1 << i
))
189 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
190 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
193 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
199 template<uint32_t sampleCountT
>
200 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
202 SWR_CONTEXT
*pContext
= pDC
->pContext
;
204 AR_BEGIN(BENullBackend
, pDC
->drawId
);
205 ///@todo: handle center multisample pattern
206 AR_BEGIN(BESetup
, pDC
->drawId
);
208 const API_STATE
&state
= GetApiState(pDC
);
210 BarycentricCoeffs coeffs
;
211 SetupBarycentricCoeffs(&coeffs
, work
);
213 uint8_t *pDepthBuffer
, *pStencilBuffer
;
214 SetupRenderBuffers(NULL
, &pDepthBuffer
, &pStencilBuffer
, 0, renderBuffers
);
216 SWR_PS_CONTEXT psContext
;
217 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
221 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
223 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
224 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
225 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
227 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
229 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
231 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
233 // iterate over active samples
234 unsigned long sample
= 0;
235 uint32_t sampleMask
= state
.blendState
.sampleMask
;
236 while (_BitScanForward(&sample
, sampleMask
))
238 sampleMask
&= ~(1 << sample
);
240 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
244 // offset depth/stencil buffers current sample
245 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
246 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
248 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
250 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
252 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
254 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
255 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
257 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
260 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
262 // calculate per sample positions
263 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, samplePos
.vX(sample
));
264 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, samplePos
.vY(sample
));
266 CalcSampleBarycentrics(coeffs
, psContext
);
268 // interpolate and quantize z
269 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
270 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
272 AR_END(BEBarycentric
, 0);
274 // interpolate user clip distance if available
275 if (state
.rastState
.clipDistanceMask
)
277 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
280 simdscalar vCoverageMask
= _simd_vmask_ps(coverageMask
);
281 simdscalar stencilPassMask
= vCoverageMask
;
283 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
284 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
285 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
286 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
287 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
288 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
289 AR_END(BEEarlyDepthTest
, 0);
291 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
292 uint32_t statCount
= _mm_popcnt_u32(statMask
);
293 UPDATE_STAT_BE(DepthPassCount
, statCount
);
298 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
301 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
302 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
304 vXSamplePosUL
= _simd_add_ps(vXSamplePosUL
, dx
);
307 vYSamplePosUL
= _simd_add_ps(vYSamplePosUL
, dy
);
310 AR_END(BENullBackend
, 0);
313 PFN_CLEAR_TILES gClearTilesTable
[NUM_SWR_FORMATS
] = {};
314 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
315 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
]
319 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
320 [2] // isCenterPattern
321 [SWR_INPUT_COVERAGE_COUNT
]
323 [2] // forcedSampleCount
326 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
327 [SWR_INPUT_COVERAGE_COUNT
]
332 void InitBackendFuncTables()
334 InitBackendPixelRate();
335 InitBackendSingleFuncTable(gBackendSingleSample
);
336 InitBackendSampleFuncTable(gBackendSampleRateTable
);
338 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
339 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
340 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
341 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
342 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;