1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "backend_impl.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
// Sample-rate backend: processes one triangle (`work`) over a tile, walking it
// in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps and, for each step, looping over
// every sample in T::MultisampleT::numSamples. Per sample the visible code
// applies the optional depth-bounds test, computes per-sample barycentrics and
// Z, applies the user clip mask, runs DepthStencilTest, invokes the pixel
// shader, runs DepthStencilTest again after the shader, and finishes with the
// output-merger / DepthStencilWrite stage.
//
// Parameters visible in this view:
//   pDC           - draw context; supplies pContext (bucket manager, thread
//                   pool), drawId, and the API/private state.
//   work          - triangle descriptor; read for coverage masks, triFlags and
//                   pUserClipBuffer. NOTE: work.coverageMask[] and
//                   work.innerCoverageMask are shifted in place by this function.
//   renderBuffers - forwarded to SetRenderHotTilesDirty at the end.
//
// NOTE(review): this view is missing lines of the original function (the
// template header that declares T, the parameters that declare workerId / x /
// y, all braces, and several call arguments). Comments below describe only
// what is visible and flag the gaps.
41 void BackendSampleRate(DRAW_CONTEXT
* pDC
,
45 SWR_TRIANGLE_DESC
& work
,
46 RenderOutputBuffers
& renderBuffers
)
// Open the RDTSC buckets for the whole backend call and for its setup phase.
48 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BESampleRateBackend
, pDC
->drawId
);
49 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BESetup
, pDC
->drawId
);
// Private data of the worker identified by workerId; passed to the pixel
// shader further down.
51 void* pWorkerData
= pDC
->pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
52 const API_STATE
& state
= GetApiState(pDC
);
// Barycentric coefficients are set up once per triangle from `work`.
54 BarycentricCoeffs coeffs
;
55 SetupBarycentricCoeffs(&coeffs
, work
);
// Pixel shader context, initialised from the rasterizer sample positions and
// the triangle descriptor.
57 SWR_PS_CONTEXT psContext
;
58 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
59 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// Buffer pointers used by the per-sample depth/stencil code below.
// NOTE(review): the remaining SetupRenderBuffers arguments are not visible in
// this view; presumably they include pDepthBuffer / pStencilBuffer and
// renderBuffers - confirm against the full file.
61 uint8_t *pDepthBuffer
, *pStencilBuffer
;
62 SetupRenderBuffers(psContext
.pColorBuffer
,
65 state
.colorHottileEnable
,
// NOTE(review): no later write to isTileDirty is visible in this view.
68 bool isTileDirty
= false;
70 RDTSC_END(pDC
->pContext
->pBucketMgr
, BESetup
, 0);
// Initial Y coordinates (upper-left and center variants) for the first row of
// SIMD tiles: per-lane offsets plus the tile origin y.
72 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
73 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
// Y step applied after each row of SIMD tiles (see the end of the yy loop).
75 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
// Outer loop: rows of SIMD tiles, covering KNOB_TILE_Y_DIM pixels from y.
77 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X coordinates are reset to the tile origin x at the start of every row.
79 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
80 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
// X step applied after each SIMD tile (see the end of the xx loop).
82 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
// Inner loop: SIMD tiles within the row, covering KNOB_TILE_X_DIM pixels from x.
84 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// True when the SIMD_TILE_X_DIM bit of xx is set; used in the end-of-tile
// code to decide when the color buffer pointers are advanced.
86 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
// Input coverage for the shader is only generated when the traits request it.
89 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
// Inner-conservative mode reads work.innerCoverageMask; every other mode
// reads the per-sample array starting at work.coverageMask[0].
91 const uint64_t* pCoverageMask
=
92 (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
93 ? &work
.innerCoverageMask
94 : &work
.coverageMask
[0];
96 generateInputCoverage
<T
, T::InputCoverage
>(
97 pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
// Pixel-level barycentrics and centroid are computed once per SIMD tile,
// before the per-sample loop.
100 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEBarycentric
, pDC
->drawId
);
102 CalcPixelBarycentrics(coeffs
, psContext
);
104 CalcCentroid
<T
, false>(
105 &psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
107 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEBarycentric
, 0);
// Per-sample loop: everything below, up to the BEEndTile bucket, is executed
// once per sample of the multisample pattern.
109 for (uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
// Coverage bits of this sample, restricted by MASK (MASK is defined outside
// this view).
111 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
115 // offset depth/stencil buffers to the current sample
116 uint8_t* pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
117 uint8_t* pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Depth-bounds test: only when the depth hot tile is enabled and the test is
// turned on in the depth-bounds state.
119 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
// The load below reinterprets the depth sample as floats, hence the
// compile-time check on the hot tile format.
121 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
,
122 "Unsupported depth hot tile format");
// NOTE(review): the variable receiving this load (used as `z` below) is
// declared on a line not visible in this view.
125 _simd_load_ps(reinterpret_cast<const float*>(pDepthSample
));
127 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
128 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
// Drop lanes rejected by the depth-bounds test from the coverage mask.
130 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
133 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEBarycentric
, pDC
->drawId
);
135 // calculate per sample positions
136 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
137 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
139 CalcSampleBarycentrics(coeffs
, psContext
);
141 // interpolate and quantize z
// NOTE(review): the middle arguments of this vplaneps call are not visible in
// this view.
142 psContext
.vZ
= vplaneps(coeffs
.vZa
,
146 psContext
.vJ
.sample
);
147 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
149 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEBarycentric
, 0);
151 // interpolate user clip distance if available
152 if (state
.backendState
.clipDistanceMask
)
// Lanes flagged by ComputeUserClipMask are removed from the coverage mask.
154 coverageMask
&= ~ComputeUserClipMask(state
.backendState
.clipDistanceMask
,
155 work
.pUserClipBuffer
,
157 psContext
.vJ
.sample
);
// Expand the coverage bitmask to a SIMD lane mask; the depth and stencil pass
// masks both start out equal to it.
160 simdscalar vCoverageMask
= _simd_vmask_ps(coverageMask
);
161 simdscalar depthPassMask
= vCoverageMask
;
162 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test before the pixel shader (bucket BEEarlyDepthTest).
// NOTE(review): any guard around this section and the trailing
// DepthStencilTest arguments are not visible in this view.
167 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEEarlyDepthTest
, pDC
->drawId
);
168 depthPassMask
= DepthStencilTest(&state
,
169 work
.triFlags
.frontFacing
,
170 work
.triFlags
.viewportIndex
,
176 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
),
177 _simd_movemask_ps(stencilPassMask
),
178 _simd_movemask_ps(vCoverageMask
)));
179 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEEarlyDepthTest
, 0);
181 // early-exit if no samples passed depth or earlyZ is forced on.
182 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
// NOTE(review): the remaining DepthStencilWrite arguments are not visible in
// this view.
184 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
],
185 &state
.depthStencilState
,
186 work
.triFlags
.frontFacing
,
// When no lane passed, shift out this SIMD tile's coverage bits for the
// sample; presumably followed by a jump to the next sample on a line not
// visible here - confirm.
194 if (!_simd_movemask_ps(depthPassMask
))
196 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Tell the shader which sample it is running for and which lanes are active.
202 psContext
.sampleIndex
= sample
;
203 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
205 // execute pixel shader
206 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEPixelShader
, pDC
->drawId
);
207 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), pWorkerData
, &psContext
);
208 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEPixelShader
, 0);
// PsInvocations is updated with the number of lanes set in the coverage mask
// that was handed to the shader.
211 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
212 AR_EVENT(PSStats((HANDLE
)&psContext
.stats
));
// Reload the coverage mask from psContext.activeMask after the shader call.
214 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// The remaining stages are only entered if at least one lane is still set.
216 if (_simd_movemask_ps(vCoverageMask
))
// Depth/stencil test after the pixel shader (bucket BELateDepthTest).
// NOTE(review): any guard around this section and the trailing
// DepthStencilTest arguments are not visible in this view.
224 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BELateDepthTest
, pDC
->drawId
);
225 depthPassMask
= DepthStencilTest(&state
,
226 work
.triFlags
.frontFacing
,
227 work
.triFlags
.viewportIndex
,
233 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
),
234 _simd_movemask_ps(stencilPassMask
),
235 _simd_movemask_ps(vCoverageMask
)));
236 RDTSC_END(pDC
->pContext
->pBucketMgr
, BELateDepthTest
, 0);
// No lane passed the post-shader depth test.
238 if (!_simd_movemask_ps(depthPassMask
))
240 // need to call depth/stencil write for stencil write
241 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
],
242 &state
.depthStencilState
,
243 work
.triFlags
.frontFacing
,
// Shift out this SIMD tile's coverage bits for the sample before leaving it.
251 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// DepthPassCount is updated with the number of lanes set in depthPassMask.
256 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
257 uint32_t statCount
= _mm_popcnt_u32(statMask
);
258 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Output merger stage.
// NOTE(review): the callee name and most arguments of the call that takes
// psContext.pColorBuffer and renderTargetMask are not visible in this view.
261 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEOutputMerger
, pDC
->drawId
);
265 psContext
.pColorBuffer
,
271 state
.psState
.renderTargetMask
,
275 // do final depth write after all pixel kills
// Skipped when forceEarlyZ is set; in that case DepthStencilWrite was already
// called in the early-exit branch above.
276 if (!state
.psState
.forceEarlyZ
)
278 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
],
279 &state
.depthStencilState
,
280 work
.triFlags
.frontFacing
,
288 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEOutputMerger
, 0);
// Shift out this SIMD tile's coverage bits for the sample.
290 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// End-of-SIMD-tile bookkeeping: advance masks, buffer pointers and coordinates.
296 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, BEEndTile
, pDC
->drawId
);
// The inner coverage mask is advanced the same way as the per-sample masks.
298 if (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
300 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Color buffer pointers are only advanced when useAlternateOffset is set, and
// then by two SIMD widths' worth of color hot tile pixels (in bytes).
303 if (useAlternateOffset
)
// Visit every set bit of colorHottileEnable; `rt` is declared on a line not
// visible in this view.
306 uint32_t rtMask
= state
.colorHottileEnable
;
307 while (_BitScanForward(&rt
, rtMask
))
// Clear the bit just found so the scan moves on to the next one.
309 rtMask
&= ~(1 << rt
);
310 psContext
.pColorBuffer
[rt
] +=
311 (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Advance the depth pointer by one SIMD width of depth hot tile pixels (bytes).
315 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
// NOTE(review): the left-hand side of this expression is on a line not visible
// here; presumably the matching pStencilBuffer advance - confirm.
317 (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
319 RDTSC_END(pDC
->pContext
->pBucketMgr
, BEEndTile
, 0);
// Step the X coordinates to the next SIMD tile in the row.
321 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
322 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Step the Y coordinates to the next row of SIMD tiles.
325 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
326 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
// NOTE(review): any condition guarding this call (e.g. on isTileDirty) is not
// visible in this view.
331 SetRenderHotTilesDirty(pDC
, renderBuffers
);
334 RDTSC_END(pDC
->pContext
->pBucketMgr
, BESampleRateBackend
, 0);
337 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
338 // arguments to static template arguments.
//
// Each GetFunc overload consumes its first runtime argument, appends the
// corresponding compile-time constant to ArgsT..., and forwards the remaining
// arguments to GetFunc of that extended instantiation. The overload taking
// SWR_BACKEND_FUNCS ends the recursion and returns the BackendSampleRate
// instantiation built from the accumulated ArgsT....
//
// NOTE(review): the braces, `switch` headers, `default:` labels and the `if`
// condition of the bool overload are on lines not visible in this view.
339 template <uint32_t... ArgsT
>
340 struct BEChooserSampleRate
342 // Last Arg Terminator
343 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
// Only the sample-rate backend is a valid selection for this chooser.
347 case SWR_BACKEND_MSAA_SAMPLE_RATE
:
348 return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>;
// The other backend kinds are listed explicitly and trip the assert.
350 case SWR_BACKEND_SINGLE_SAMPLE
:
351 case SWR_BACKEND_MSAA_PIXEL_RATE
:
352 SWR_ASSERT(0 && "Invalid backend func\n");
// Second assert; presumably the `default:` path (label not visible here).
356 SWR_ASSERT(0 && "Invalid backend func\n");
362 // Recursively parse args
// Input coverage mode: appends the matching SWR_INPUT_COVERAGE_* constant.
// NOTE(review): the arguments of each GetFunc( call below are on lines not
// visible in this view; presumably remainingArgs... as in the other overloads.
363 template <typename
... TArgsT
>
364 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
368 case SWR_INPUT_COVERAGE_NONE
:
369 return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(
372 case SWR_INPUT_COVERAGE_NORMAL
:
373 return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(
376 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
:
377 return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(
// Unrecognised value: assert, then fall back to SWR_INPUT_COVERAGE_NONE.
// NOTE(review): the assert message says "sample pattern" although this
// overload handles input coverage.
381 SWR_ASSERT(0 && "Invalid sample pattern\n");
382 return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(
388 // Recursively parse args
// Multisample count: appends the matching SWR_MULTISAMPLE_* constant.
389 template <typename
... TArgsT
>
390 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
394 case SWR_MULTISAMPLE_1X
:
395 return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
397 case SWR_MULTISAMPLE_2X
:
398 return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...);
400 case SWR_MULTISAMPLE_4X
:
401 return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...);
403 case SWR_MULTISAMPLE_8X
:
404 return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...);
406 case SWR_MULTISAMPLE_16X
:
407 return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...);
// Unrecognised value: assert, then fall back to SWR_MULTISAMPLE_1X.
410 SWR_ASSERT(0 && "Invalid sample count\n");
411 return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
416 // Recursively parse args
// Boolean flag: appends 1 or 0 to ArgsT.... The selecting condition is on a
// line not visible here; presumably it tests tArg - confirm.
417 template <typename
... TArgsT
>
418 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
422 return BEChooserSampleRate
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
425 return BEChooserSampleRate
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
// Fills `table` with one backend function pointer per combination of sample
// count, input coverage mode, centroid flag and canEarlyZ flag. Each entry is
// obtained from BEChooserSampleRate<>::GetFunc, with
// SWR_BACKEND_MSAA_SAMPLE_RATE passed as the final (terminating) argument.
//
// table - output; indexed as [sampleCount][inputCoverage][centroid][canEarlyZ].
//
// NOTE(review): several GetFunc arguments and the sampleCount loop increment
// are on lines not visible in this view.
429 void InitBackendSampleFuncTable(
430 PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
// Outermost dimension: sample counts, starting at SWR_MULTISAMPLE_1X.
432 for (uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
;
435 for (uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
// The last two dimensions are two-valued flags (0 and 1).
437 for (uint32_t centroid
= 0; centroid
< 2; centroid
++)
439 for (uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
// The loop counters are cast back to their enum types so that overload
// resolution picks the matching GetFunc overload for each argument.
441 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
442 BEChooserSampleRate
<>::GetFunc(
443 (SWR_MULTISAMPLE_COUNT
)sampleCount
,
445 (SWR_INPUT_COVERAGE
)inputCoverage
,
449 (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);