1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "backend_impl.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
// Single-sample backend for one triangle over one tile.
//
// Walks the tile anchored at (x, y) in steps of SIMD_TILE_X_DIM by SIMD_TILE_Y_DIM
// (bounded by KNOB_TILE_X_DIM / KNOB_TILE_Y_DIM). For every step it derives the
// coverage mask, optionally applies the depth-bounds test and user clip
// distances, evaluates barycentrics and depth, runs depth/stencil testing,
// invokes the pixel shader, and performs the output-merger / depth-stencil write.
//
// Parameters visible in this listing:
//   pDC           - draw context; supplies drawId, the API state and the thread pool.
//   work          - triangle descriptor; its coverageMask[0] / innerCoverageMask are
//                   shifted in place as SIMD tiles are consumed.
//   renderBuffers - render output buffers for this tile.
//
// NOTE(review): `T`, `workerId`, `x` and `y` are used in the body but their
// declarations (template header / remaining parameters) are not visible in this
// listing, and neither are the braces; confirm scoping against the full source.
41 void BackendSingleSample(DRAW_CONTEXT
* pDC
,
45 SWR_TRIANGLE_DESC
& work
,
46 RenderOutputBuffers
& renderBuffers
)
// RDTSC_BEGIN / RDTSC_END bracket named regions keyed by the draw id
// (presumably timing buckets; confirm against the macro definition).
48 RDTSC_BEGIN(BESingleSampleBackend
, pDC
->drawId
);
49 RDTSC_BEGIN(BESetup
, pDC
->drawId
);
// Private data of the worker identified by workerId; handed to the pixel shader below.
51 void* pWorkerData
= pDC
->pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
53 const API_STATE
& state
= GetApiState(pDC
);
// Barycentric coefficients are derived from the triangle before the tile loops
// and reused by CalcPixelBarycentrics / CalcCentroid / vplaneps below.
55 BarycentricCoeffs coeffs
;
56 SetupBarycentricCoeffs(&coeffs
, work
);
58 SWR_PS_CONTEXT psContext
;
59 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
60 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// Depth / stencil hot-tile cursors. pDepthBuffer is advanced in the end-of-tile
// section; the stencil pointer is presumably advanced there as well.
62 uint8_t *pDepthBuffer
, *pStencilBuffer
;
// NOTE(review): several SetupRenderBuffers arguments are missing from this listing.
63 SetupRenderBuffers(psContext
.pColorBuffer
,
66 state
.colorHottileEnable
,
69 RDTSC_END(BESetup
, 1);
// Seed the Y coordinates (upper-left and center variants) from the tile origin y.
71 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
72 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
// Per-row increment; added to vY after the X loop (see the vY updates near the end).
74 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
76 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Re-seed the X coordinates from the tile origin x for this row.
78 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
79 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
// Per-column increment; added to vX at the end of each SIMD tile.
81 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
83 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// True when the SIMD_TILE_X_DIM bit of xx is set; consulted in the end-of-tile
// section, where it appears to gate the color buffer pointer advance.
85 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
// Coverage for this SIMD tile: coverageMask[0] restricted by MASK (MASK is
// defined outside this listing). coverageMask[0] is shifted at end of tile.
88 simdmask coverageMask
= work
.coverageMask
[0] & MASK
;
// Depth-bounds test: applied only when the depth hot tile is enabled and the
// test is switched on in the API state.
92 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
// The depth hot tile must be R32_FLOAT, matching the float load just below.
94 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
,
95 "Unsupported depth hot tile format");
// NOTE(review): the declaration receiving this load (used below as `z`) is
// not visible in this listing.
98 _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer
));
100 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
101 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
// Keep only lanes accepted by the bounds check of z against minz / maxz.
103 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
// Input coverage for the shader is generated only when T::InputCoverage asks
// for it; the source mask is the inner-conservative one or coverageMask[0].
106 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
108 const uint64_t* pCoverageMask
=
109 (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
110 ? &work
.innerCoverageMask
111 : &work
.coverageMask
[0];
113 generateInputCoverage
<T
, T::InputCoverage
>(
114 pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
117 RDTSC_BEGIN(BEBarycentric
, pDC
->drawId
);
119 CalcPixelBarycentrics(coeffs
, psContext
);
// Centroid evaluation is fed work.coverageMask (not the locally reduced
// coverageMask) together with the blend-state sample mask.
121 CalcCentroid
<T
, true>(
122 &psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
124 // interpolate and quantize z
125 psContext
.vZ
= vplaneps(
126 coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
127 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
129 RDTSC_END(BEBarycentric
, 1);
131 // interpolate user clip distance if available
132 if (state
.backendState
.clipDistanceMask
)
// Lanes reported by ComputeUserClipMask are removed from coverage (note the ~).
// NOTE(review): one argument of this call is missing from the listing.
134 coverageMask
&= ~ComputeUserClipMask(state
.backendState
.clipDistanceMask
,
135 work
.pUserClipBuffer
,
137 psContext
.vJ
.center
);
// Expand the bit mask into a vector mask; depth and stencil pass masks both
// start out equal to coverage.
140 simdscalar vCoverageMask
= _simd_vmask_ps(coverageMask
);
141 simdscalar depthPassMask
= vCoverageMask
;
142 simdscalar stencilPassMask
= vCoverageMask
;
// Early depth/stencil test.
// NOTE(review): the condition guarding this section is not visible here;
// presumably a compile-time early-Z trait on T. The remaining
// DepthStencilTest arguments are also missing.
147 RDTSC_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
148 depthPassMask
= DepthStencilTest(&state
,
149 work
.triFlags
.frontFacing
,
150 work
.triFlags
.viewportIndex
,
156 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
),
157 _simd_movemask_ps(stencilPassMask
),
158 _simd_movemask_ps(vCoverageMask
)));
159 RDTSC_END(BEEarlyDepthTest
, 0);
161 // early-exit if no pixels passed depth or earlyZ is forced on
// With forceEarlyZ set, the depth/stencil write is issued here, before shading;
// the post-output-merger write further down is conditioned on !forceEarlyZ.
162 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
164 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
],
165 &state
.depthStencilState
,
166 work
.triFlags
.frontFacing
,
// No lane passed depth.
// NOTE(review): the statement executed here is not visible; presumably a
// jump to the end-of-tile section.
174 if (!_simd_movemask_ps(depthPassMask
))
// Single-sample path: shade sample 0, with coverage as the initial active mask.
181 psContext
.sampleIndex
= 0;
182 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
184 // execute pixel shader
185 RDTSC_BEGIN(BEPixelShader
, pDC
->drawId
);
186 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), pWorkerData
, &psContext
);
187 RDTSC_END(BEPixelShader
, 0);
// Invocation count uses the coverage that went INTO the shader; vCoverageMask
// is refreshed from activeMask only after this statistic is recorded.
190 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
191 AR_EVENT(PSStats((HANDLE
)&psContext
.stats
));
// Pick up the active mask as left by the shader (e.g. after discards, per the
// "consolidate discards" comment below).
193 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Late depth/stencil test.
// NOTE(review): the guard for this section is not visible; presumably the
// complement of the early-Z path. Trailing DepthStencilTest arguments are missing.
198 RDTSC_BEGIN(BELateDepthTest
, pDC
->drawId
);
199 depthPassMask
= DepthStencilTest(&state
,
200 work
.triFlags
.frontFacing
,
201 work
.triFlags
.viewportIndex
,
207 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
),
208 _simd_movemask_ps(stencilPassMask
),
209 _simd_movemask_ps(vCoverageMask
)));
210 RDTSC_END(BELateDepthTest
, 0);
212 if (!_simd_movemask_ps(depthPassMask
))
214 // need to call depth/stencil write for stencil write
215 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
],
216 &state
.depthStencilState
,
217 work
.triFlags
.frontFacing
,
229 // for early z, consolidate discards from shader
230 // into depthPassMask
231 depthPassMask
= _simd_and_ps(depthPassMask
, vCoverageMask
);
// DepthPassCount is the number of lanes still set in depthPassMask at this point.
234 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
235 uint32_t statCount
= _mm_popcnt_u32(statMask
);
236 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Output merger.
// NOTE(review): the call itself and most of its arguments are missing from this
// listing; only psContext.pColorBuffer and state.psState.renderTargetMask survive.
239 RDTSC_BEGIN(BEOutputMerger
, pDC
->drawId
);
243 psContext
.pColorBuffer
,
249 state
.psState
.renderTargetMask
,
253 // do final depth write after all pixel kills
254 if (!state
.psState
.forceEarlyZ
)
256 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
],
257 &state
.depthStencilState
,
258 work
.triFlags
.frontFacing
,
266 RDTSC_END(BEOutputMerger
, 0);
// End of SIMD tile: drop the coverage bits just consumed so the next
// iteration's `coverageMask[0] & MASK` reads the following bits.
270 RDTSC_BEGIN(BEEndTile
, pDC
->drawId
);
272 work
.coverageMask
[0] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
273 if (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
275 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
278 if (useAlternateOffset
)
// Visit every set bit of colorHottileEnable, clearing each from the local copy.
// NOTE(review): the declaration of `rt` is not visible in this listing.
281 uint32_t rtMask
= state
.colorHottileEnable
;
282 while (_BitScanForward(&rt
, rtMask
))
284 rtMask
&= ~(1 << rt
);
// Advance this target's color pointer by 2 * KNOB_SIMD_WIDTH elements
// (bpp / 8 presumably converts bits per pixel to bytes).
285 psContext
.pColorBuffer
[rt
] +=
286 (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Depth pointer advances by KNOB_SIMD_WIDTH elements.
290 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
// NOTE(review): the left-hand side of this statement is missing; presumably
// `pStencilBuffer +=`.
292 (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
294 RDTSC_END(BEEndTile
, 0);
// Advance the X coordinates by one SIMD tile width.
296 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
297 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Advance the Y coordinates by one SIMD tile height.
300 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
301 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
304 RDTSC_END(BESingleSampleBackend
, 0);
307 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
308 // arguments to static template arguments.
//
// Each GetFunc overload consumes its first runtime argument and forwards to
// BEChooserSingleSample<ArgsT..., CONSTANT>::GetFunc with the matching
// compile-time constant appended to ArgsT. The overload taking
// SWR_BACKEND_FUNCS ends the recursion and returns the BackendSingleSample
// instantiation for the accumulated ArgsT.
//
// NOTE(review): braces, the `switch` / `default` / `if` lines and some call
// arguments are missing from this listing; the notes below mark where.
309 template <uint32_t... ArgsT
>
310 struct BEChooserSingleSample
312 // Last Arg Terminator
// Returns BackendSingleSample<SwrBackendTraits<ArgsT...>> for
// SWR_BACKEND_SINGLE_SAMPLE; the MSAA backend kinds hit the assert below.
313 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
317 case SWR_BACKEND_SINGLE_SAMPLE
:
318 return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>;
320 case SWR_BACKEND_MSAA_PIXEL_RATE
:
321 case SWR_BACKEND_MSAA_SAMPLE_RATE
:
// NOTE(review): whatever is returned after this assert is not visible here.
323 SWR_ASSERT(0 && "Invalid backend func\n");
329 // Recursively parse args
// Input-coverage overload: appends the matching SWR_INPUT_COVERAGE_* constant.
// NOTE(review): the arguments of each recursive GetFunc( call are cut off in
// this listing; presumably `remainingArgs...` as in the overloads below.
330 template <typename
... TArgsT
>
331 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
335 case SWR_INPUT_COVERAGE_NONE
:
336 return BEChooserSingleSample
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(
339 case SWR_INPUT_COVERAGE_NORMAL
:
340 return BEChooserSingleSample
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(
343 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
:
344 return BEChooserSingleSample
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(
// Fallback path (the `default:` label is not visible): assert, then fall back
// to SWR_INPUT_COVERAGE_NONE.
// NOTE(review): the message says "sample pattern" although this overload
// dispatches on input coverage; looks like a copy-paste slip, confirm.
348 SWR_ASSERT(0 && "Invalid sample pattern\n");
349 return BEChooserSingleSample
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(
355 // Recursively parse args
// Sample-count overload: appends the matching SWR_MULTISAMPLE_* constant.
356 template <typename
... TArgsT
>
357 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
361 case SWR_MULTISAMPLE_1X
:
362 return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
364 case SWR_MULTISAMPLE_2X
:
365 return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...);
367 case SWR_MULTISAMPLE_4X
:
368 return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...);
370 case SWR_MULTISAMPLE_8X
:
371 return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...);
373 case SWR_MULTISAMPLE_16X
:
374 return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...);
// Fallback path (the `default:` label is not visible): assert, then fall back
// to SWR_MULTISAMPLE_1X.
377 SWR_ASSERT(0 && "Invalid sample count\n");
378 return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
383 // Recursively parse args
// Boolean overload: appends 1 or 0 to ArgsT.
// NOTE(review): the branch choosing between the two returns is not visible;
// presumably `if (tArg)` selects 1 and the fall-through selects 0.
384 template <typename
... TArgsT
>
385 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
389 return BEChooserSingleSample
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
392 return BEChooserSingleSample
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
// Fills the single-sample backend function table.
//
// Iterates every (inputCoverage, isCentroid, canEarlyZ) combination and stores
// in table[inputCoverage][isCentroid][canEarlyZ] the function pointer returned
// by BEChooserSingleSample<>::GetFunc. The argument list starts with
// SWR_MULTISAMPLE_1X and ends with SWR_BACKEND_SINGLE_SAMPLE.
//
// table - reference to a [SWR_INPUT_COVERAGE_COUNT][2][2] array of
//         PFN_BACKEND_FUNC; every entry is overwritten.
//
// NOTE(review): some GetFunc arguments between SWR_MULTISAMPLE_1X and
// SWR_BACKEND_SINGLE_SAMPLE are missing from this listing (only the
// inputCoverage cast survives); presumably they carry isCentroid / canEarlyZ.
396 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
398 for (uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
400 for (uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
402 for (uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
404 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
405 BEChooserSingleSample
<>::GetFunc(SWR_MULTISAMPLE_1X
,
// The loop index is cast back to the enum so overload resolution picks the
// SWR_INPUT_COVERAGE overload of GetFunc.
407 (SWR_INPUT_COVERAGE
)inputCoverage
,
411 SWR_BACKEND_SINGLE_SAMPLE
);