1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "backend_impl.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
// BackendSampleRate: sample-rate backend entry point. Going by the visible statements, it
// walks the region [x, x + KNOB_TILE_X_DIM) x [y, y + KNOB_TILE_Y_DIM) in steps of
// SIMD_TILE_X_DIM / SIMD_TILE_Y_DIM and, for every sample index below
// T::MultisampleT::numSamples, runs depth/stencil testing, the pixel shader and the
// output merger.
//
// Parameters (as used below):
//   pDC           - draw context; source of the API state, the draw id passed to the
//                   RDTSC_BEGIN markers, and the private state handed to the pixel shader.
//   workerId      - not named directly by any visible statement; presumably consumed inside
//                   a macro such as UPDATE_STAT_BE - TODO confirm.
//   x, y          - starting coordinates of the region to process.
//   work          - triangle descriptor. Its coverageMask[] entries (and innerCoverageMask)
//                   are shifted right as each SIMD tile is consumed, so the caller's object
//                   is modified.
//   renderBuffers - forwarded to SetupRenderBuffers to obtain color/depth/stencil pointers.
//
// NOTE(review): this listing appears to have lost lines during extraction. `T` is used
// without a visible `template <typename T>` header, no braces are present, the `#if` blocks
// have no visible `#else`/`#endif`, `rt` has no visible declaration, and the embedded line
// numbers skip (e.g. 94 -> 98, 136 -> 141, 170 -> 175). Restore the missing lines from the
// canonical source before building; the comments below describe only what is visible.
41 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
// RDTSC_BEGIN / RDTSC_END bracket named regions (presumably timing instrumentation; the
// macros are defined elsewhere). The outer region spans the whole function.
43 RDTSC_BEGIN(BESampleRateBackend
, pDC
->drawId
);
44 RDTSC_BEGIN(BESetup
, pDC
->drawId
);
46 const API_STATE
&state
= GetApiState(pDC
);
// Per-triangle setup: barycentric coefficients, pixel shader context, buffer pointers.
48 BarycentricCoeffs coeffs
;
49 SetupBarycentricCoeffs(&coeffs
, work
);
51 SWR_PS_CONTEXT psContext
;
52 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
53 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// pDepthBuffer / pStencilBuffer are filled in by SetupRenderBuffers and advanced at the
// end of every SIMD tile further down.
55 uint8_t *pDepthBuffer
, *pStencilBuffer
;
56 SetupRenderBuffers(psContext
.pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.colorHottileEnable
, renderBuffers
);
58 RDTSC_END(BESetup
, 0);
// Initial Y positions: per-lane offsets plus the starting row y.
60 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
61 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
// dy is the per-iteration Y increment applied after the inner (xx) loop.
63 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
65 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X positions are reset to the starting column x at the top of every row of SIMD tiles.
67 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
68 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
// dx is the per-iteration X increment applied at the end of each SIMD tile.
70 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
72 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
74 #if USE_8x2_TILE_BACKEND
// True when the SIMD_TILE_X_DIM bit of xx is set; passed to OutputMerger8x2 and used to
// decide when the color buffer pointers advance.
// NOTE(review): the matching #endif is not visible in this listing.
75 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
// Input coverage is only generated when the traits request it; the inner-conservative
// variant reads work.innerCoverageMask, otherwise work.coverageMask[0] is used.
77 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
79 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
81 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
// Pixel-level barycentrics and centroid are computed once per SIMD tile, before the
// per-sample loop.
84 RDTSC_BEGIN(BEBarycentric
, pDC
->drawId
);
86 CalcPixelBarycentrics(coeffs
, psContext
);
88 CalcCentroid
<T
, false>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
90 RDTSC_END(BEBarycentric
, 0);
// Per-sample loop: everything from here to the BEEndTile region runs once per sample.
92 for (uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
// Keep only the bits of this sample's coverage selected by MASK (defined elsewhere).
// NOTE(review): embedded numbering jumps 94 -> 98; a guard on coverageMask may be missing.
94 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
98 // offset depth/stencil buffers to the current sample
99 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
100 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Optional depth-bounds test against the depth values already stored for this sample.
102 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
// The raw float load below reinterprets the depth bytes as floats, so the hot tile
// format must be R32_FLOAT; this is enforced at compile time.
104 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
106 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
108 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
109 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
// Lanes rejected by the bounds test are removed from the coverage mask.
111 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
114 RDTSC_BEGIN(BEBarycentric
, pDC
->drawId
);
116 // calculate per sample positions
117 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
118 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
120 CalcSampleBarycentrics(coeffs
, psContext
);
122 // interpolate and quantize z
123 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
124 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
126 RDTSC_END(BEBarycentric
, 0);
128 // interpolate user clip distance if available
129 if (state
.backendState
.clipDistanceMask
)
// Lanes flagged by ComputeUserClipMask are cleared from the coverage mask.
131 coverageMask
&= ~ComputeUserClipMask(state
.backendState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Expand the bit mask into a per-lane vector mask; depth and stencil pass masks both
// start out equal to coverage.
134 simdscalar vCoverageMask
= _simd_vmask_ps(coverageMask
);
135 simdscalar depthPassMask
= vCoverageMask
;
136 simdscalar stencilPassMask
= vCoverageMask
;
// Early depth/stencil test.
// NOTE(review): embedded numbering jumps 136 -> 141, so the condition that guards this
// early-Z section is not visible here.
141 RDTSC_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
142 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
143 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
144 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
145 RDTSC_END(BEEarlyDepthTest
, 0);
147 // early-exit if no samples passed depth or earlyZ is forced on.
148 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
150 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
151 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
// When no lane passed depth, this sample's coverage is shifted past the current SIMD tile.
// NOTE(review): numbering jumps 155 -> 161; the statement that leaves this iteration
// early is presumably among the missing lines.
153 if (!_simd_movemask_ps(depthPassMask
))
155 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Tell the shader which sample it is running for and which lanes are active.
161 psContext
.sampleIndex
= sample
;
162 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
164 // execute pixel shader
165 RDTSC_BEGIN(BEPixelShader
, pDC
->drawId
);
// The PsInvocations stat is bumped by the number of set lanes in the coverage mask.
166 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
167 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
168 RDTSC_END(BEPixelShader
, 0);
// activeMask is read back because the shader receives &psContext and may have updated it.
170 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Late depth/stencil test, run after the pixel shader.
// NOTE(review): embedded numbering jumps 170 -> 175, so the condition that guards this
// late-Z section is not visible here.
175 RDTSC_BEGIN(BELateDepthTest
, pDC
->drawId
);
176 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
177 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
178 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
179 RDTSC_END(BELateDepthTest
, 0);
181 if (!_simd_movemask_ps(depthPassMask
))
183 // need to call depth/stencil write for stencil write
184 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
185 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
// Same coverage shift as the early-Z failure path above.
187 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// The DepthPassCount stat is bumped by the number of lanes that passed depth.
192 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
193 uint32_t statCount
= _mm_popcnt_u32(statMask
);
194 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Output merger: the 8x2 and 4x2 variants take the same arguments except that the 8x2 call
// also receives useAlternateOffset.
// NOTE(review): the #else / #endif that presumably separate the two calls are not visible.
197 RDTSC_BEGIN(BEOutputMerger
, pDC
->drawId
);
198 #if USE_8x2_TILE_BACKEND
199 OutputMerger8x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.renderTargetMask
, useAlternateOffset
);
201 OutputMerger4x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.renderTargetMask
);
204 // do final depth write after all pixel kills
205 if (!state
.psState
.forceEarlyZ
)
207 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
208 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
210 RDTSC_END(BEOutputMerger
, 0);
// Shift this sample's coverage by one SIMD tile worth of bits (SIMD_TILE_Y_DIM *
// SIMD_TILE_X_DIM) so the next tile's bits are presumably in the low positions.
212 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// End-of-tile bookkeeping: advance masks, buffer pointers and positions to the next
// SIMD tile.
218 RDTSC_BEGIN(BEEndTile
, pDC
->drawId
);
220 if (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
222 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Color buffer pointers: for every bit set in state.colorHottileEnable the matching
// pColorBuffer entry is advanced. In the first variant (taken when useAlternateOffset is
// set) the step is 2 * KNOB_SIMD_WIDTH pixels; in the second it is KNOB_SIMD_WIDTH pixels.
// The bpp / 8 factor converts bits per pixel to bytes.
// NOTE(review): the declaration of `rt` and the #else / #endif that presumably separate
// the two variants are not visible in this listing.
225 #if USE_8x2_TILE_BACKEND
226 if (useAlternateOffset
)
229 uint32_t rtMask
= state
.colorHottileEnable
;
230 while (_BitScanForward(&rt
, rtMask
))
// Clear the bit just found so the scan moves on to the next enabled render target.
232 rtMask
&= ~(1 << rt
);
233 psContext
.pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
238 uint32_t rtMask
= state
.colorHottileEnable
;
239 while (_BitScanForward(&rt
, rtMask
))
241 rtMask
&= ~(1 << rt
);
242 psContext
.pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Depth and stencil pointers always advance by KNOB_SIMD_WIDTH pixels of their format.
245 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
246 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
248 RDTSC_END(BEEndTile
, 0);
// Step the X positions to the next SIMD tile in this row.
250 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
251 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Step the Y positions to the next row of SIMD tiles (after the xx loop).
254 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
255 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
258 RDTSC_END(BESampleRateBackend
, 0);
261 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
262 // arguments to static template arguments.
//
// Each GetFunc overload consumes its first runtime argument (tArg), appends the matching
// compile-time constant to the ArgsT pack, and forwards the remaining arguments to
// BEChooserSampleRate<ArgsT..., constant>::GetFunc. The overload taking SWR_BACKEND_FUNCS
// ends the recursion by returning BackendSampleRate<SwrBackendTraits<ArgsT...>>.
//
// NOTE(review): this listing appears to have lost lines during extraction. No braces,
// `switch (tArg)`, `default:` labels or `if`/`else` keywords are visible, and the embedded
// line numbers skip. Restore from the canonical source before building.
263 template <uint32_t... ArgsT
>
264 struct BEChooserSampleRate
266 // Last Arg Terminator
// Only SWR_BACKEND_MSAA_SAMPLE_RATE yields a function; the single-sample and pixel-rate
// values fall into an assert.
// NOTE(review): what is returned after the asserts is not visible in this listing.
267 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
271 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
272 case SWR_BACKEND_SINGLE_SAMPLE
:
273 case SWR_BACKEND_MSAA_PIXEL_RATE
:
274 SWR_ASSERT(0 && "Invalid backend func\n");
278 SWR_ASSERT(0 && "Invalid backend func\n");
284 // Recursively parse args
// Input-coverage overload: maps tArg onto the matching SWR_INPUT_COVERAGE_* constant.
285 template <typename
... TArgsT
>
286 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
290 case SWR_INPUT_COVERAGE_NONE
: return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
291 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
292 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
// Fallback after the assert: continues with SWR_INPUT_COVERAGE_NONE (presumably under a
// `default:` label that is not visible here).
// NOTE(review): the message says "sample pattern" although this overload selects on input
// coverage; confirm whether the wording is intentional.
294 SWR_ASSERT(0 && "Invalid sample pattern\n");
295 return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
300 // Recursively parse args
// Sample-count overload: maps tArg onto the matching SWR_MULTISAMPLE_*X constant.
301 template <typename
... TArgsT
>
302 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
306 case SWR_MULTISAMPLE_1X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
307 case SWR_MULTISAMPLE_2X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
308 case SWR_MULTISAMPLE_4X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
309 case SWR_MULTISAMPLE_8X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
310 case SWR_MULTISAMPLE_16X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
// Fallback after the assert: continues with SWR_MULTISAMPLE_1X (presumably under a
// `default:` label that is not visible here).
312 SWR_ASSERT(0 && "Invalid sample count\n");
313 return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
318 // Recursively parse args
// Boolean overload: appends 1 or 0 to the pack. The condition choosing between the two
// returns is not visible; presumably `if (tArg)` selects the `1` branch - TODO confirm.
319 template <typename
... TArgsT
>
320 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
324 return BEChooserSampleRate
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
327 return BEChooserSampleRate
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
// InitBackendSampleFuncTable: fills `table` with one backend function pointer for every
// combination of sample count, input coverage, centroid flag and canEarlyZ flag.
//
//   table - caller-owned array, passed by reference and indexed as
//           [sampleCount][inputCoverage][centroid][canEarlyZ]; every element is overwritten.
//
// Each entry comes from BEChooserSampleRate<>::GetFunc, which turns the runtime loop values
// into template arguments. The argument order is: sample count, literal false, input
// coverage, centroid, literal false, canEarlyZ, SWR_BACKEND_MSAA_SAMPLE_RATE.
// NOTE(review): the meaning of the two literal `false` flags is defined by SwrBackendTraits,
// which is not shown here - confirm against its template parameter list.
// NOTE(review): braces are missing from this listing (extraction damage); the four loops are
// presumably nested in the order shown.
331 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
333 for (uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
335 for (uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
337 for (uint32_t centroid
= 0; centroid
< 2; centroid
++)
339 for (uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
// The loop counters are cast back to their enum types, and the 0/1 counters are turned
// into bools with `> 0`, so the matching GetFunc overloads are selected.
341 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
342 BEChooserSampleRate
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
343 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);