swr/rast: fix USE_SIMD16_FRONTEND issues
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend_singlesample.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend_singlesample.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 template<typename T>
41 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
42 {
43 SWR_CONTEXT *pContext = pDC->pContext;
44
45 AR_BEGIN(BESingleSampleBackend, pDC->drawId);
46 AR_BEGIN(BESetup, pDC->drawId);
47
48 const API_STATE &state = GetApiState(pDC);
49
50 BarycentricCoeffs coeffs;
51 SetupBarycentricCoeffs(&coeffs, work);
52
53 SWR_PS_CONTEXT psContext;
54 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
55 SetupPixelShaderContext<T>(&psContext, samplePos, work);
56
57 uint8_t *pDepthBuffer, *pStencilBuffer;
58 SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
59
60 AR_END(BESetup, 1);
61
62 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
63 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
64
65 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
66
67 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
68 {
69 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
70 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
71
72 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
73
74 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
75 {
76 #if USE_8x2_TILE_BACKEND
77 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
78 #endif
79 simdmask coverageMask = work.coverageMask[0] & MASK;
80
81 if (coverageMask)
82 {
83 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
84 {
85 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
86
87 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
88
89 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
90 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
91
92 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
93 }
94
95 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
96 {
97 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
98
99 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
100 }
101
102 AR_BEGIN(BEBarycentric, pDC->drawId);
103
104 CalcPixelBarycentrics(coeffs, psContext);
105
106 CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
107
108 // interpolate and quantize z
109 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
110 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
111
112 AR_END(BEBarycentric, 1);
113
114 // interpolate user clip distance if available
115 if (state.rastState.clipDistanceMask)
116 {
117 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
118 }
119
120 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
121 simdscalar depthPassMask = vCoverageMask;
122 simdscalar stencilPassMask = vCoverageMask;
123
124 // Early-Z?
125 if (T::bCanEarlyZ)
126 {
127 AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
128 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
129 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
130 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
131 AR_END(BEEarlyDepthTest, 0);
132
133 // early-exit if no pixels passed depth or earlyZ is forced on
134 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
135 {
136 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
137 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
138
139 if (!_simd_movemask_ps(depthPassMask))
140 {
141 goto Endtile;
142 }
143 }
144 }
145
146 psContext.sampleIndex = 0;
147 psContext.activeMask = _simd_castps_si(vCoverageMask);
148
149 // execute pixel shader
150 AR_BEGIN(BEPixelShader, pDC->drawId);
151 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
152 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
153 AR_END(BEPixelShader, 0);
154
155 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
156
157 // late-Z
158 if (!T::bCanEarlyZ)
159 {
160 AR_BEGIN(BELateDepthTest, pDC->drawId);
161 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
162 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
163 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
164 AR_END(BELateDepthTest, 0);
165
166 if (!_simd_movemask_ps(depthPassMask))
167 {
168 // need to call depth/stencil write for stencil write
169 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
170 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
171 goto Endtile;
172 }
173 } else {
174 // for early z, consolidate discards from shader
175 // into depthPassMask
176 depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
177 }
178
179 uint32_t statMask = _simd_movemask_ps(depthPassMask);
180 uint32_t statCount = _mm_popcnt_u32(statMask);
181 UPDATE_STAT_BE(DepthPassCount, statCount);
182
183 // output merger
184 AR_BEGIN(BEOutputMerger, pDC->drawId);
185 #if USE_8x2_TILE_BACKEND
186 OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
187 #else
188 OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
189 #endif
190
191 // do final depth write after all pixel kills
192 if (!state.psState.forceEarlyZ)
193 {
194 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
195 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
196 }
197 AR_END(BEOutputMerger, 0);
198 }
199
200 Endtile:
201 AR_BEGIN(BEEndTile, pDC->drawId);
202
203 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
204 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
205 {
206 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
207 }
208
209 #if USE_8x2_TILE_BACKEND
210 if (useAlternateOffset)
211 {
212 DWORD rt;
213 uint32_t rtMask = state.colorHottileEnable;
214 while(_BitScanForward(&rt, rtMask))
215 {
216 rtMask &= ~(1 << rt);
217 psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
218 }
219 }
220 #else
221 DWORD rt;
222 uint32_t rtMask = state.colorHottileEnable;
223 while (_BitScanForward(&rt, rtMask))
224 {
225 rtMask &= ~(1 << rt);
226 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
227 }
228 #endif
229 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
230 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
231
232 AR_END(BEEndTile, 0);
233
234 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
235 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
236 }
237
238 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
239 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
240 }
241
242 AR_END(BESingleSampleBackend, 0);
243 }
244
245 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
246 // arguments to static template arguments.
247 template <uint32_t... ArgsT>
248 struct BEChooserSingleSample
249 {
250 // Last Arg Terminator
251 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
252 {
253 switch(tArg)
254 {
255 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
256 case SWR_BACKEND_MSAA_PIXEL_RATE:
257 case SWR_BACKEND_MSAA_SAMPLE_RATE:
258 default:
259 SWR_ASSERT(0 && "Invalid backend func\n");
260 return nullptr;
261 break;
262 }
263 }
264
265 // Recursively parse args
266 template <typename... TArgsT>
267 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
268 {
269 switch(tArg)
270 {
271 case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
272 case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
273 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
274 default:
275 SWR_ASSERT(0 && "Invalid sample pattern\n");
276 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
277 break;
278 }
279 }
280
281 // Recursively parse args
282 template <typename... TArgsT>
283 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
284 {
285 switch(tArg)
286 {
287 case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
288 case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
289 case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
290 case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
291 case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
292 default:
293 SWR_ASSERT(0 && "Invalid sample count\n");
294 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
295 break;
296 }
297 }
298
299 // Recursively parse args
300 template <typename... TArgsT>
301 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
302 {
303 if(tArg == true)
304 {
305 return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
306 }
307
308 return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
309 }
310 };
311
312 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
313 {
314 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
315 {
316 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
317 {
318 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
319 {
320 table[inputCoverage][isCentroid][canEarlyZ] =
321 BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
322 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
323 }
324 }
325 }
326 }