1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "backend_impl.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
// BackendSampleRate: sample-rate backend for one raster tile whose origin is (x, y).
// Walks the tile in SIMD-tile steps (outer loop over Y, inner loop over X) and, for every
// sample in T::MultisampleT::numSamples, runs depth/stencil testing, the pixel shader
// (state.psState.pfnPixelShader), the output merger and the depth/stencil write.
//
// Parameters:
//   pDC           - draw context; supplies pContext, drawId and the API state (GetApiState).
//   workerId      - not referenced in the visible lines (presumably consumed by a macro -- verify).
//   x, y          - tile origin; seeds the psContext.vX / psContext.vY positions and the loop bounds.
//   work          - triangle descriptor. NOTE: work.coverageMask[] and work.innerCoverageMask are
//                   shifted in place as SIMD tiles are consumed, so the caller's copy is modified.
//   renderBuffers - forwarded to SetupRenderBuffers to obtain color/depth/stencil pointers.
//
// NOTE(review): this listing has lost lines (braces, the template<> header that introduces T,
// #else/#endif and several conditionals -- see the gaps in the embedded line numbers).
// T is presumably a SwrBackendTraits<> instantiation, as selected by BEChooserSampleRate.
41 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
// pContext is not used directly below; presumably the AR_* / UPDATE_STAT_BE macros read it -- verify.
43 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// AR_BEGIN/AR_END bracket named regions keyed by drawId (presumably instrumentation buckets).
45 AR_BEGIN(BESampleRateBackend
, pDC
->drawId
);
// NOTE(review): the AR_END matching BESetup is not visible in this listing.
46 AR_BEGIN(BESetup
, pDC
->drawId
);
48 const API_STATE
&state
= GetApiState(pDC
);
// Per-triangle interpolation coefficients, derived once from the triangle descriptor.
50 BarycentricCoeffs coeffs
;
51 SetupBarycentricCoeffs(&coeffs
, work
);
// Pixel shader context shared by every SIMD tile and sample processed below.
53 SWR_PS_CONTEXT psContext
;
54 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
55 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// Fetch the color/depth/stencil buffer pointers; these are advanced at the end of each SIMD tile.
57 uint8_t *pDepthBuffer
, *pStencilBuffer
;
58 SetupRenderBuffers(psContext
.pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.colorHottileEnable
, renderBuffers
);
// Seed the Y positions (upper-left and center variants) of the first SIMD tile row from y.
62 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
63 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
// Y step added to the positions after each SIMD tile row.
65 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
// Outer loop: SIMD tile rows of the raster tile.
67 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Reset the X positions to the left edge of the tile at the start of every row.
69 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
70 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
// X step added to the positions after each SIMD tile.
72 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
// Inner loop: SIMD tiles within the current row.
74 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// 8x2 backend only: true when the SIMD_TILE_X_DIM bit of xx is set. Used below for the
// output merger and for deciding when to advance the color buffer pointers.
76 #if USE_8x2_TILE_BACKEND
77 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
// When the shader consumes input coverage, build psContext.inputMask from the inner
// coverage mask (inner-conservative mode) or from the sample-0 coverage mask otherwise.
79 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
81 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
83 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
// Per-pixel barycentrics and centroid are computed once per SIMD tile, outside the sample loop.
86 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
88 CalcPixelBarycentrics(coeffs
, psContext
);
90 CalcCentroid
<T
, false>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
92 AR_END(BEBarycentric
, 0);
// Sample loop: everything below runs once per sample of the current SIMD tile.
94 for (uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
// Coverage bits of this sample for the current SIMD tile (MASK presumably keeps the low
// SIMD-tile-sized group of bits -- confirm). NOTE(review): any guard on coverageMask that
// follows this line is not visible in this listing.
96 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
100 // offset depth/stencil buffers current sample
101 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
102 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Depth bounds test: reads the stored depth of this sample as floats (the static_assert pins
// the hot tile format to R32_FLOAT) and removes lanes outside [minz, maxz] from the coverage.
104 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
106 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
108 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
110 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
111 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
113 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
116 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
118 // calculate per sample positions
119 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
120 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
122 CalcSampleBarycentrics(coeffs
, psContext
);
124 // interpolate and quantize z
125 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
126 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
128 AR_END(BEBarycentric
, 0);
130 // interpolate user clip distance if available
131 if (state
.backendState
.clipDistanceMask
)
// Lanes flagged by ComputeUserClipMask are removed from the coverage.
133 coverageMask
&= ~ComputeUserClipMask(state
.backendState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Expand the bit mask to a per-lane vector mask; depth and stencil pass masks start out
// equal to the coverage and are narrowed by the tests below.
136 simdscalar vCoverageMask
= _simd_vmask_ps(coverageMask
);
137 simdscalar depthPassMask
= vCoverageMask
;
138 simdscalar stencilPassMask
= vCoverageMask
;
// Early depth/stencil test (before the pixel shader).
// NOTE(review): the condition guarding this early path is not visible in this listing.
143 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
144 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
145 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
146 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
147 AR_END(BEEarlyDepthTest
, 0);
149 // early-exit if no samples passed depth or earlyZ is forced on.
150 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
152 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
153 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
// No lane passed depth: drop this SIMD tile's coverage bits for the sample so the next
// tile's bits move into the low positions. (Any statement that follows, e.g. skipping to
// the next sample, is not visible in this listing.)
155 if (!_simd_movemask_ps(depthPassMask
))
157 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Tell the shader which sample it is running for and which lanes are active.
163 psContext
.sampleIndex
= sample
;
164 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
166 // execute pixel shader
167 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
// Invocation statistic counts the lanes that are active on entry to the shader.
168 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
169 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
170 AR_END(BEPixelShader
, 0);
// Re-read the active mask: the shader receives &psContext and may have changed it.
172 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Late depth/stencil test (after the pixel shader).
// NOTE(review): the condition guarding this late path is not visible in this listing.
177 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
178 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
179 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
180 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
181 AR_END(BELateDepthTest
, 0);
183 if (!_simd_movemask_ps(depthPassMask
))
185 // need to call depth/stencil write for stencil write
186 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
187 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
// Same coverage shift as in the early-test exit above.
189 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Depth-pass statistic: number of lanes set in depthPassMask.
194 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
195 uint32_t statCount
= _mm_popcnt_u32(statMask
);
196 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Output merger: the 8x2 variant takes useAlternateOffset, the 4x2 variant does not.
// NOTE(review): the #else/#endif separating the two calls are not visible in this listing.
199 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
200 #if USE_8x2_TILE_BACKEND
201 OutputMerger8x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.renderTargetMask
, useAlternateOffset
);
203 OutputMerger4x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.renderTargetMask
);
206 // do final depth write after all pixel kills
207 if (!state
.psState
.forceEarlyZ
)
209 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
210 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
212 AR_END(BEOutputMerger
, 0);
// Consume this SIMD tile's coverage bits for the sample (normal path).
214 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// End of SIMD tile: shift the inner coverage mask and advance the buffer pointers.
220 AR_BEGIN(BEEndTile
, pDC
->drawId
);
222 if (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
224 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// 8x2 backend: color pointers advance only when useAlternateOffset is set, and then by
// 2 * KNOB_SIMD_WIDTH pixels. The loop visits each set bit of colorHottileEnable: rt receives
// the bit index, the bit is cleared from rtMask, and pColorBuffer[rt] is advanced.
// NOTE(review): the declaration of rt is not visible in this listing; bpp is presumably
// bits per pixel (hence the division by 8) -- confirm.
227 #if USE_8x2_TILE_BACKEND
228 if (useAlternateOffset
)
231 uint32_t rtMask
= state
.colorHottileEnable
;
232 while (_BitScanForward(&rt
, rtMask
))
234 rtMask
&= ~(1 << rt
);
235 psContext
.pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Same walk over the enabled render targets, advancing by KNOB_SIMD_WIDTH pixels
// (presumably the non-8x2 branch; the #else/#endif are not visible in this listing).
240 uint32_t rtMask
= state
.colorHottileEnable
;
241 while (_BitScanForward(&rt
, rtMask
))
243 rtMask
&= ~(1 << rt
);
244 psContext
.pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Depth and stencil pointers always advance by KNOB_SIMD_WIDTH elements of their format.
247 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
248 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
250 AR_END(BEEndTile
, 0);
// Step the X positions to the next SIMD tile in the row.
252 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
253 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Step the Y positions to the next SIMD tile row.
256 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
257 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
260 AR_END(BESampleRateBackend
, 0);
263 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
264 // arguments to static template arguments.
// BEChooserSampleRate<ArgsT...>: ArgsT accumulates the compile-time trait values chosen so far.
// Each GetFunc overload consumes its first runtime argument, appends the matching constant to
// ArgsT and recurses on the remaining arguments; the overload taking SWR_BACKEND_FUNCS ends the
// recursion by returning BackendSampleRate<SwrBackendTraits<ArgsT...>>.
// NOTE(review): this listing has lost lines (braces, the switch/if headers, default labels and
// the return statements on the assert paths -- see the gaps in the embedded line numbers).
265 template <uint32_t... ArgsT
>
266 struct BEChooserSampleRate
268 // Last Arg Terminator
// Only SWR_BACKEND_MSAA_SAMPLE_RATE yields a function; the other two enumerators and the
// second assert (presumably the default label) are treated as invalid.
// The `break` after `return` on the first case is unreachable.
269 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
273 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
274 case SWR_BACKEND_SINGLE_SAMPLE
:
275 case SWR_BACKEND_MSAA_PIXEL_RATE
:
276 SWR_ASSERT(0 && "Invalid backend func\n");
280 SWR_ASSERT(0 && "Invalid backend func\n");
286 // Recursively parse args
// Input-coverage overload: appends the matching SWR_INPUT_COVERAGE_* value to ArgsT.
// After the assert, the fallback continues with SWR_INPUT_COVERAGE_NONE.
// NOTE(review): the assert text says "Invalid sample pattern" although this overload
// dispatches on input coverage -- the message looks stale.
287 template <typename
... TArgsT
>
288 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
292 case SWR_INPUT_COVERAGE_NONE
: return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
293 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
294 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
296 SWR_ASSERT(0 && "Invalid sample pattern\n");
297 return BEChooserSampleRate
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
302 // Recursively parse args
// Sample-count overload: appends the matching SWR_MULTISAMPLE_* value to ArgsT.
// After the assert, the fallback continues with SWR_MULTISAMPLE_1X.
303 template <typename
... TArgsT
>
304 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
308 case SWR_MULTISAMPLE_1X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
309 case SWR_MULTISAMPLE_2X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
310 case SWR_MULTISAMPLE_4X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
311 case SWR_MULTISAMPLE_8X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
312 case SWR_MULTISAMPLE_16X
: return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
314 SWR_ASSERT(0 && "Invalid sample count\n");
315 return BEChooserSampleRate
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
320 // Recursively parse args
// Boolean overload: appends 1 or 0 to ArgsT (the test on tArg that selects between the two
// returns is not visible in this listing; presumably 1 for true, 0 for false).
321 template <typename
... TArgsT
>
322 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
326 return BEChooserSampleRate
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
329 return BEChooserSampleRate
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
333 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
335 for (uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
337 for (uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
339 for (uint32_t centroid
= 0; centroid
< 2; centroid
++)
341 for (uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
343 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
344 BEChooserSampleRate
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
345 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);