1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
33 #include "backend_impl.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
// BackendSingleSample: backend entry point for one tile whose origin is (x, y).
// Walks the tile in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps. For each step the
// visible statements apply an optional depth-bounds test, compute barycentrics
// and interpolated Z, run depth/stencil testing, invoke the pixel shader, call
// the output merger, and finally advance the coverage masks and buffer pointers.
//
// Parameters (as used below):
//   pDC           - draw context; supplies pContext, drawId and the API state.
//   workerId      - not referenced by any statement visible in this listing.
//   x, y          - starting coordinates of the tile being processed.
//   work          - triangle descriptor; its coverage masks are shifted in place.
//   renderBuffers - passed to SetupRenderBuffers to obtain the buffer pointers.
//
// NOTE(review): this listing has lost lines (braces, the template header that
// declares T, several conditionals, #else/#endif, the declaration of rt). The
// comments below describe only the statements that are actually visible.
41 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
// Cache the context pointer held by the draw context.
43 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// Open the BESingleSampleBackend and BESetup AR regions for this draw id.
45 AR_BEGIN(BESingleSampleBackend
, pDC
->drawId
);
46 AR_BEGIN(BESetup
, pDC
->drawId
);
// API state associated with this draw.
48 const API_STATE
&state
= GetApiState(pDC
);
// Barycentric coefficients are derived from the triangle descriptor.
50 BarycentricCoeffs coeffs
;
51 SetupBarycentricCoeffs(&coeffs
, work
);
// Pixel shader context, initialized from the rasterizer sample positions and
// the triangle descriptor.
53 SWR_PS_CONTEXT psContext
;
54 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
55 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// SetupRenderBuffers fills in the color buffer array plus the depth and
// stencil pointers; both pointers are advanced at the end of each SIMD tile.
57 uint8_t *pDepthBuffer
, *pStencilBuffer
;
58 SetupRenderBuffers(psContext
.pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.colorHottileEnable
, renderBuffers
);
// Seed the Y coordinates (upper-left and center variants) by adding the tile's
// y to the predefined offset vectors.
62 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
63 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
// Y increment applied after each row of SIMD tiles.
65 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
// Outer loop: step through the tile vertically, one SIMD tile row at a time.
67 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Re-seed the X coordinates from the tile's x at the start of every row.
69 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
70 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
// X increment applied after each SIMD tile.
72 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
// Inner loop: step through the row horizontally, one SIMD tile at a time.
74 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
// In the 8x2 configuration, record whether the SIMD_TILE_X_DIM bit of xx is
// set; the flag is handed to OutputMerger8x2 and gates the color pointer
// advance at the end of the tile.
76 #if USE_8x2_TILE_BACKEND
77 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
// Coverage bits for the current SIMD tile (MASK is defined elsewhere).
79 simdmask coverageMask
= work
.coverageMask
[0] & MASK
;
// Optional depth-bounds test: load the depth values currently at pDepthBuffer
// and keep only the lanes accepted by CalcDepthBoundsAcceptMask.
83 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
// The load below reinterprets the depth buffer as floats, hence the format check.
85 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
87 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer
));
89 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
90 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
92 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
// When input coverage is requested, generate psContext.inputMask from either
// the inner-conservative mask or the regular coverage mask.
95 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
97 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
99 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
// Barycentric region: pixel barycentrics, centroid, then interpolated Z.
102 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
104 CalcPixelBarycentrics(coeffs
, psContext
);
106 CalcCentroid
<T
, true>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
108 // interpolate and quantize z
109 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
110 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
112 AR_END(BEBarycentric
, 1);
114 // interpolate user clip distance if available
115 if (state
.rastState
.clipDistanceMask
)
// Lanes reported by ComputeUserClipMask are removed from coverage.
117 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.center
, psContext
.vJ
.center
);
// Convert the bit mask to a vector mask; the depth and stencil pass masks
// start out equal to the coverage.
120 simdscalar vCoverageMask
= _simd_vmask_ps(coverageMask
);
121 simdscalar depthPassMask
= vCoverageMask
;
122 simdscalar stencilPassMask
= vCoverageMask
;
// Early depth/stencil test, performed before the pixel shader call.
// NOTE(review): the condition that guards this section is not visible in this
// listing — confirm against the full file.
127 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
128 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
129 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
130 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
131 AR_END(BEEarlyDepthTest
, 0);
133 // early-exit if no pixels passed depth or earlyZ is forced on
134 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
// Depth/stencil are written here, ahead of shading, in either of those cases.
136 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
137 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
// NOTE(review): the statement controlled by the following condition is not
// visible in this listing; per the comment above it is presumably the early exit.
139 if (!_simd_movemask_ps(depthPassMask
))
// Shade with sample index 0 and the current coverage as the active lane mask.
146 psContext
.sampleIndex
= 0;
147 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
149 // execute pixel shader
150 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
// The invocation statistic counts the lanes set in the coverage mask.
151 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
152 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
153 AR_END(BEPixelShader
, 0);
// Reload the coverage from activeMask after the shader call.
155 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Late depth/stencil test, performed after the pixel shader call.
// NOTE(review): the condition that guards this section is not visible in this
// listing — confirm against the full file.
160 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
161 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
162 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
163 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
164 AR_END(BELateDepthTest
, 0);
// No lane passed the depth test.
166 if (!_simd_movemask_ps(depthPassMask
))
168 // need to call depth/stencil write for stencil write
169 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
170 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
174 // for early z, consolidate discards from shader
175 // into depthPassMask
176 depthPassMask
= _simd_and_ps(depthPassMask
, vCoverageMask
);
// DepthPassCount is updated with the number of lanes set in depthPassMask.
179 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
180 uint32_t statCount
= _mm_popcnt_u32(statMask
);
181 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Output merger: the 8x2 variant additionally receives useAlternateOffset.
// NOTE(review): the #else/#endif separating the two calls are not visible here.
184 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
185 #if USE_8x2_TILE_BACKEND
186 OutputMerger8x2(psContext
, psContext
.pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.renderTargetMask
, useAlternateOffset
);
188 OutputMerger4x2(psContext
, psContext
.pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.renderTargetMask
);
191 // do final depth write after all pixel kills
192 if (!state
.psState
.forceEarlyZ
)
194 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
195 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
197 AR_END(BEOutputMerger
, 0);
// End of SIMD tile: shift the coverage masks and advance the buffer pointers.
201 AR_BEGIN(BEEndTile
, pDC
->drawId
);
// Shift out the bits consumed by this SIMD tile so the next iteration sees its
// own coverage in the low bits.
203 work
.coverageMask
[0] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
204 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
206 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// 8x2 configuration: color pointers advance by two SIMD widths of pixels, and
// only when useAlternateOffset is set.
// NOTE(review): the declaration of rt is not visible in this listing.
209 #if USE_8x2_TILE_BACKEND
210 if (useAlternateOffset
)
// Iterate over the set bits of colorHottileEnable, clearing each one as it is
// handled.
213 uint32_t rtMask
= state
.colorHottileEnable
;
214 while(_BitScanForward(&rt
, rtMask
))
216 rtMask
&= ~(1 << rt
);
// bpp is divided by 8 — presumably bits per pixel converted to bytes; confirm
// against FormatTraits.
217 psContext
.pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Other configuration: color pointers advance by one SIMD width of pixels on
// every SIMD tile.
222 uint32_t rtMask
= state
.colorHottileEnable
;
223 while (_BitScanForward(&rt
, rtMask
))
225 rtMask
&= ~(1 << rt
);
226 psContext
.pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Depth and stencil pointers advance by one SIMD width of pixels in their
// respective formats.
229 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
230 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
232 AR_END(BEEndTile
, 0);
// Step the X coordinates to the next SIMD tile in the row.
234 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
235 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Step the Y coordinates to the next row of SIMD tiles.
238 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
239 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
// Close the region opened at the top of the function.
242 AR_END(BESingleSampleBackend
, 0);
245 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
246 // arguments to static template arguments.
//
// Each GetFunc overload consumes its first runtime argument, appends the
// matching compile-time value to ArgsT, and forwards the remaining arguments to
// GetFunc of that extended instantiation. The overload taking only a
// SWR_BACKEND_FUNCS ends the recursion by returning
// BackendSingleSample<SwrBackendTraits<ArgsT...>>.
//
// NOTE(review): this listing has lost lines (braces, switch headers, default
// labels, the branch in the bool overload); comments describe only what is visible.
247 template <uint32_t... ArgsT
>
248 struct BEChooserSingleSample
250 // Last Arg Terminator
// All earlier arguments have been folded into ArgsT by this point.
251 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
// Only SWR_BACKEND_SINGLE_SAMPLE yields a function here.
255 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
// In this listing the two MSAA values lead directly to the assert below.
256 case SWR_BACKEND_MSAA_PIXEL_RATE
:
257 case SWR_BACKEND_MSAA_SAMPLE_RATE
:
259 SWR_ASSERT(0 && "Invalid backend func\n");
265 // Recursively parse args
// Input-coverage overload: appends the matching SWR_INPUT_COVERAGE_* value.
266 template <typename
... TArgsT
>
267 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
271 case SWR_INPUT_COVERAGE_NONE
: return BEChooserSingleSample
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
272 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooserSingleSample
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
273 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooserSingleSample
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
// Values not handled above assert, then return the SWR_INPUT_COVERAGE_NONE
// instantiation.
275 SWR_ASSERT(0 && "Invalid sample pattern\n");
276 return BEChooserSingleSample
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
281 // Recursively parse args
// Sample-count overload: appends the matching SWR_MULTISAMPLE_* value.
282 template <typename
... TArgsT
>
283 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
287 case SWR_MULTISAMPLE_1X
: return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
288 case SWR_MULTISAMPLE_2X
: return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
289 case SWR_MULTISAMPLE_4X
: return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
290 case SWR_MULTISAMPLE_8X
: return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
291 case SWR_MULTISAMPLE_16X
: return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
// Values not handled above assert, then return the SWR_MULTISAMPLE_1X
// instantiation.
293 SWR_ASSERT(0 && "Invalid sample count\n");
294 return BEChooserSingleSample
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
299 // Recursively parse args
// Boolean overload: appends either 1 or 0 to ArgsT.
// NOTE(review): the test on tArg that selects between the two returns is not
// visible in this listing — presumably 1 when tArg is true, 0 otherwise.
300 template <typename
... TArgsT
>
301 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
305 return BEChooserSingleSample
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
308 return BEChooserSingleSample
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
312 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
314 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
316 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
318 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
320 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
321 BEChooserSingleSample
<>::GetFunc(SWR_MULTISAMPLE_1X
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
322 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);