swr/rast: Refactor memory API between rasterizer core and swr
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend_singlesample.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend_singlesample.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample backend. Walks a KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM
///        region starting at (x, y) in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM
///        steps and, for every step that still has coverage, runs the
///        depth/stencil test, the pixel shader and the output merger.
/// @tparam T - traits type; this function reads T::InputCoverage and
///        T::bCanEarlyZ and forwards T to SetupPixelShaderContext,
///        generateInputCoverage and CalcCentroid.
/// @param pDC - draw context; source of the API state, the private state
///        handed to the pixel shader, and the per-worker private data.
/// @param workerId - index into pDC->pContext->threadPool.pThreadData; also
///        forwarded to OutputMerger8x2.
/// @param x - X origin of the region to process.
/// @param y - Y origin of the region to process.
/// @param work - triangle descriptor. NOTE: modified in place; coverageMask[0]
///        (and innerCoverageMask for inner-conservative input coverage) is
///        shifted right after each SIMD tile is processed.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the color,
///        depth and stencil buffer pointers used below.
40 template <typename T>
41 void BackendSingleSample(DRAW_CONTEXT* pDC,
42 uint32_t workerId,
43 uint32_t x,
44 uint32_t y,
45 SWR_TRIANGLE_DESC& work,
46 RenderOutputBuffers& renderBuffers)
47 {
48 RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId);
49 RDTSC_BEGIN(BESetup, pDC->drawId);
50
51 void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
52
53 const API_STATE& state = GetApiState(pDC);
54
55 BarycentricCoeffs coeffs;
56 SetupBarycentricCoeffs(&coeffs, work);
57
58 SWR_PS_CONTEXT psContext;
59 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
60 SetupPixelShaderContext<T>(&psContext, samplePos, work);
61
// Byte pointers into the depth/stencil buffers; they are advanced by one
// SIMD tile's worth of data at the end of every inner-loop iteration.
62 uint8_t *pDepthBuffer, *pStencilBuffer;
63 SetupRenderBuffers(psContext.pColorBuffer,
64 &pDepthBuffer,
65 &pStencilBuffer,
66 state.colorHottileEnable,
67 renderBuffers);
68
69 RDTSC_END(BESetup, 1);
70
// Seed the per-pixel Y positions (upper-left and center variants) for the
// first row of SIMD tiles; both are stepped by dy at the bottom of the outer loop.
71 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
72 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
73
74 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
75
76 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
77 {
// X positions restart at the left edge of the region for every row.
78 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
79 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
80
81 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
82
83 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
84 {
// True when xx has the SIMD_TILE_X_DIM bit set. Passed to OutputMerger8x2
// and used at Endtile to decide when the color buffer pointers advance.
85 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
86
87
// Coverage for the current SIMD tile is taken from the low bits of
// work.coverageMask[0]; the mask is shifted down at Endtile so the next
// tile's bits land here. (MASK presumably selects one SIMD tile's worth
// of bits - TODO confirm against its definition.)
88 simdmask coverageMask = work.coverageMask[0] & MASK;
89
90 if (coverageMask)
91 {
// Depth-bounds test: uses the depth value currently stored in the depth
// buffer (loaded from pDepthBuffer), not the triangle's interpolated depth.
92 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
93 {
94 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
95 "Unsupported depth hot tile format");
96
97 const simdscalar z =
98 _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));
99
100 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
101 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
102
103 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
104 }
105
// Input coverage for the shader comes from the inner-conservative mask
// when T requests it, otherwise from the regular coverage mask.
106 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
107 {
108 const uint64_t* pCoverageMask =
109 (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
110 ? &work.innerCoverageMask
111 : &work.coverageMask[0];
112
113 generateInputCoverage<T, T::InputCoverage>(
114 pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
115 }
116
117 RDTSC_BEGIN(BEBarycentric, pDC->drawId);
118
119 CalcPixelBarycentrics(coeffs, psContext);
120
121 CalcCentroid<T, true>(
122 &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
123
124 // interpolate and quantize z
125 psContext.vZ = vplaneps(
126 coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
127 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
128
129 RDTSC_END(BEBarycentric, 1);
130
131 // interpolate user clip distance if available
132 if (state.backendState.clipDistanceMask)
133 {
// Pixels flagged by ComputeUserClipMask are removed from coverage.
134 coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
135 work.pUserClipBuffer,
136 psContext.vI.center,
137 psContext.vJ.center);
138 }
139
// Expand the bit mask into a per-lane vector mask. Depth and stencil pass
// masks start out equal to coverage and are narrowed by DepthStencilTest.
140 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
141 simdscalar depthPassMask = vCoverageMask;
142 simdscalar stencilPassMask = vCoverageMask;
143
144 // Early-Z?
145 if (T::bCanEarlyZ)
146 {
147 RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
148 depthPassMask = DepthStencilTest(&state,
149 work.triFlags.frontFacing,
150 work.triFlags.viewportIndex,
151 psContext.vZ,
152 pDepthBuffer,
153 vCoverageMask,
154 pStencilBuffer,
155 &stencilPassMask);
156 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
157 _simd_movemask_ps(stencilPassMask),
158 _simd_movemask_ps(vCoverageMask)));
159 RDTSC_END(BEEarlyDepthTest, 0);
160
161 // early-exit if no pixels passed depth or earlyZ is forced on
// The write is issued here in both cases; the tile is only skipped when
// no lane passed the depth test.
162 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
163 {
164 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
165 &state.depthStencilState,
166 work.triFlags.frontFacing,
167 psContext.vZ,
168 pDepthBuffer,
169 depthPassMask,
170 vCoverageMask,
171 pStencilBuffer,
172 stencilPassMask);
173
174 if (!_simd_movemask_ps(depthPassMask))
175 {
176 goto Endtile;
177 }
178 }
179 }
180
181 psContext.sampleIndex = 0;
182 psContext.activeMask = _simd_castps_si(vCoverageMask);
183
184 // execute pixel shader
185 RDTSC_BEGIN(BEPixelShader, pDC->drawId);
186 state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
187 RDTSC_END(BEPixelShader, 0);
188
189 // update stats
// Counted from vCoverageMask as it was before the shader ran (it is only
// refreshed from psContext.activeMask below).
190 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
191 AR_EVENT(PSStats((HANDLE)&psContext.stats));
192
// Pick up the active mask as left by the pixel shader call.
193 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
194
195 // late-Z
196 if (!T::bCanEarlyZ)
197 {
198 RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
199 depthPassMask = DepthStencilTest(&state,
200 work.triFlags.frontFacing,
201 work.triFlags.viewportIndex,
202 psContext.vZ,
203 pDepthBuffer,
204 vCoverageMask,
205 pStencilBuffer,
206 &stencilPassMask);
207 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
208 _simd_movemask_ps(stencilPassMask),
209 _simd_movemask_ps(vCoverageMask)));
210 RDTSC_END(BELateDepthTest, 0);
211
212 if (!_simd_movemask_ps(depthPassMask))
213 {
214 // need to call depth/stencil write for stencil write
215 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
216 &state.depthStencilState,
217 work.triFlags.frontFacing,
218 psContext.vZ,
219 pDepthBuffer,
220 depthPassMask,
221 vCoverageMask,
222 pStencilBuffer,
223 stencilPassMask);
224 goto Endtile;
225 }
226 }
227 else
228 {
229 // for early z, consolidate discards from shader
230 // into depthPassMask
231 depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
232 }
233
234 uint32_t statMask = _simd_movemask_ps(depthPassMask);
235 uint32_t statCount = _mm_popcnt_u32(statMask);
236 UPDATE_STAT_BE(DepthPassCount, statCount);
237
238 // output merger
239 RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
240
241 OutputMerger8x2(pDC,
242 psContext,
243 psContext.pColorBuffer,
244 0,
245 &state.blendState,
246 state.pfnBlendFunc,
247 vCoverageMask,
248 depthPassMask,
249 state.psState.renderTargetMask,
250 useAlternateOffset,
251 workerId);
252
253 // do final depth write after all pixel kills
// Skipped with forceEarlyZ because the early-Z block above already issued
// the write (assumes forceEarlyZ is only set when T::bCanEarlyZ - TODO confirm).
254 if (!state.psState.forceEarlyZ)
255 {
256 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
257 &state.depthStencilState,
258 work.triFlags.frontFacing,
259 psContext.vZ,
260 pDepthBuffer,
261 depthPassMask,
262 vCoverageMask,
263 pStencilBuffer,
264 stencilPassMask);
265 }
266 RDTSC_END(BEOutputMerger, 0);
267 }
268
// Reached by fall-through and by the goto early-outs above. The per-tile
// bookkeeping below runs for every SIMD tile, shaded or not.
269 Endtile:
270 RDTSC_BEGIN(BEEndTile, pDC->drawId);
271
// Drop the bits of the tile just processed so the next tile's coverage
// occupies the low bits.
272 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
273 if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
274 {
275 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
276 }
277
// Color buffer pointers advance only on every other SIMD tile, and then by
// two SIMD widths of pixels, for each render target enabled in
// state.colorHottileEnable.
278 if (useAlternateOffset)
279 {
280 DWORD rt;
281 uint32_t rtMask = state.colorHottileEnable;
282 while (_BitScanForward(&rt, rtMask))
283 {
284 rtMask &= ~(1 << rt);
285 psContext.pColorBuffer[rt] +=
286 (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
287 }
288 }
289
// Depth and stencil pointers advance by one SIMD width of pixels (in bytes)
// on every tile.
290 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
291 pStencilBuffer +=
292 (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
293
294 RDTSC_END(BEEndTile, 0);
295
296 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
297 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
298 }
299
300 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
301 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
302 }
303
304 RDTSC_END(BESingleSampleBackend, 0);
305 }
306
307 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
308 // arguments to static template arguments.
309 template <uint32_t... ArgsT>
310 struct BEChooserSingleSample
311 {
312 // Last Arg Terminator
313 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
314 {
315 switch (tArg)
316 {
317 case SWR_BACKEND_SINGLE_SAMPLE:
318 return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
319 break;
320 case SWR_BACKEND_MSAA_PIXEL_RATE:
321 case SWR_BACKEND_MSAA_SAMPLE_RATE:
322 default:
323 SWR_ASSERT(0 && "Invalid backend func\n");
324 return nullptr;
325 break;
326 }
327 }
328
329 // Recursively parse args
330 template <typename... TArgsT>
331 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
332 {
333 switch (tArg)
334 {
335 case SWR_INPUT_COVERAGE_NONE:
336 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
337 remainingArgs...);
338 break;
339 case SWR_INPUT_COVERAGE_NORMAL:
340 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
341 remainingArgs...);
342 break;
343 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
344 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
345 remainingArgs...);
346 break;
347 default:
348 SWR_ASSERT(0 && "Invalid sample pattern\n");
349 return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
350 remainingArgs...);
351 break;
352 }
353 }
354
355 // Recursively parse args
356 template <typename... TArgsT>
357 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
358 {
359 switch (tArg)
360 {
361 case SWR_MULTISAMPLE_1X:
362 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
363 break;
364 case SWR_MULTISAMPLE_2X:
365 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
366 break;
367 case SWR_MULTISAMPLE_4X:
368 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
369 break;
370 case SWR_MULTISAMPLE_8X:
371 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
372 break;
373 case SWR_MULTISAMPLE_16X:
374 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
375 break;
376 default:
377 SWR_ASSERT(0 && "Invalid sample count\n");
378 return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
379 break;
380 }
381 }
382
383 // Recursively parse args
384 template <typename... TArgsT>
385 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
386 {
387 if (tArg == true)
388 {
389 return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
390 }
391
392 return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
393 }
394 };
395
396 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
397 {
398 for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
399 {
400 for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
401 {
402 for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
403 {
404 table[inputCoverage][isCentroid][canEarlyZ] =
405 BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X,
406 false,
407 (SWR_INPUT_COVERAGE)inputCoverage,
408 (isCentroid > 0),
409 false,
410 (canEarlyZ > 0),
411 SWR_BACKEND_SINGLE_SAMPLE);
412 }
413 }
414 }
415 }