swr/rast: More precise user clip distance interpolation
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend_impl.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend_impl.h
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29 #pragma once
30
31 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
32 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
33
34 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
35
36
// Identifiers for the backend variants; SWR_BACKEND_FUNCS_MAX is the number
// of real entries that precede it.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE    = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
    SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
    SWR_BACKEND_FUNCS_MAX        = 3,
};
44
#if KNOB_SIMD_WIDTH == 8
// Per-lane pixel offsets for an 8-wide SIMD. The lane order walks x = 0,1 on
// two rows (y = 0,1) and then x = 2,3 on the same two rows.
// "Center" variants add half a pixel to the upper-left ("UL") variants.
static const __m256 vCenterOffsetsX = __m256{0.5f, 1.5f, 0.5f, 1.5f, 2.5f, 3.5f, 2.5f, 3.5f};
static const __m256 vCenterOffsetsY = __m256{0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f};
static const __m256 vULOffsetsX     = __m256{0.0f, 1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 2.0f, 3.0f};
static const __m256 vULOffsetsY     = __m256{0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f};
// one coverage bit per SIMD lane
#define MASK 0xff
#endif
52
53 static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar const &vI, simdscalar const &vJ)
54 {
55 simdscalar vClipMask = _simd_setzero_ps();
56 uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
57
58 for (uint32_t i = 0; i < numClipDistance; ++i)
59 {
60 // pull triangle clip distance values from clip buffer
61 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
62 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
63 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
64
65 simdscalar vK = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
66
67 // interpolate
68 simdscalar vInterp = vplaneps(vA, vB, _simd_mul_ps(vK, vC), vI, vJ);
69
70 // clip if interpolated clip distance is < 0 || NAN
71 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
72
73 vClipMask = _simd_or_ps(vClipMask, vCull);
74 }
75
76 return _simd_movemask_ps(vClipMask);
77 }
78
79 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
80 {
81 static const uint32_t RasterTileColorOffsets[16]
82 { 0,
83 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
84 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
85 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
86 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
87 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
88 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
89 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
90 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
91 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
92 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
93 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
94 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
95 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
96 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
97 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
98 };
99 assert(sampleNum < 16);
100 return RasterTileColorOffsets[sampleNum];
101 }
102
103 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
104 {
105 static const uint32_t RasterTileDepthOffsets[16]
106 { 0,
107 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
108 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
109 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
110 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
111 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
112 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
113 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
114 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
115 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
116 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
117 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
118 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
119 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
120 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
121 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
122 };
123 assert(sampleNum < 16);
124 return RasterTileDepthOffsets[sampleNum];
125 }
126
127 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
128 {
129 static const uint32_t RasterTileStencilOffsets[16]
130 { 0,
131 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
132 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
133 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
134 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
135 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
136 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
137 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
138 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
139 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
140 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
141 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
142 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
143 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
144 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
145 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
146 };
147 assert(sampleNum < 16);
148 return RasterTileStencilOffsets[sampleNum];
149 }
150
151 template<typename T, uint32_t InputCoverage>
152 struct generateInputCoverage
153 {
154 INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
155 {
156 // will need to update for avx512
157 assert(KNOB_SIMD_WIDTH == 8);
158
159 simdscalari mask[2];
160 simdscalari sampleCoverage[2];
161
162 if(T::bIsCenterPattern)
163 {
164 // center coverage is the same for all samples; just broadcast to the sample slots
165 uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
166 if(T::MultisampleT::numSamples == 1)
167 {
168 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
169 }
170 else if(T::MultisampleT::numSamples == 2)
171 {
172 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
173 }
174 else if(T::MultisampleT::numSamples == 4)
175 {
176 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
177 }
178 else if(T::MultisampleT::numSamples == 8)
179 {
180 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
181 }
182 else if(T::MultisampleT::numSamples == 16)
183 {
184 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
185 sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
186 }
187 }
188 else
189 {
190 simdscalari src = _simd_set1_epi32(0);
191 simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
192
193 if(T::MultisampleT::numSamples == 1)
194 {
195 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
196 }
197 else if(T::MultisampleT::numSamples == 2)
198 {
199 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
200 }
201 else if(T::MultisampleT::numSamples == 4)
202 {
203 mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
204 }
205 else if(T::MultisampleT::numSamples == 8)
206 {
207 mask[0] = _simd_set1_epi32(-1);
208 }
209 else if(T::MultisampleT::numSamples == 16)
210 {
211 mask[0] = _simd_set1_epi32(-1);
212 mask[1] = _simd_set1_epi32(-1);
213 index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
214 }
215
216 // gather coverage for samples 0-7
217 sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
218 if(T::MultisampleT::numSamples > 8)
219 {
220 // gather coverage for samples 8-15
221 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
222 }
223 }
224
225 mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
226 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
227 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
228 simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
229
230 simdscalari packedCoverage1;
231 if(T::MultisampleT::numSamples > 8)
232 {
233 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
234 packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
235 }
236
237 #if (KNOB_ARCH == KNOB_ARCH_AVX)
238 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
239 simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
240 simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
241 packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
242
243 simdscalari packedSampleCoverage;
244 if(T::MultisampleT::numSamples > 8)
245 {
246 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
247 hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
248 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
249 shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
250 packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
251 packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
252 }
253 else
254 {
255 packedSampleCoverage = packedCoverage0;
256 }
257 #else
258 simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
259 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
260 packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
261
262 simdscalari packedSampleCoverage;
263 if(T::MultisampleT::numSamples > 8)
264 {
265 permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
266 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
267 packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
268
269 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
270 packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
271 }
272 else
273 {
274 packedSampleCoverage = packedCoverage0;
275 }
276 #endif
277
278 for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
279 {
280 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
281 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
282
283 if(!T::bForcedSampleCount)
284 {
285 // input coverage has to be anded with sample mask if MSAA isn't forced on
286 inputMask[i] &= sampleMask;
287 }
288
289 // shift to the next pixel in the 4x2
290 packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
291 }
292 }
293
294 INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
295 {
296 uint32_t inputMask[KNOB_SIMD_WIDTH];
297 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
298 inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
299 }
300
301 };
302
303 template<typename T>
304 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
305 {
306 INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
307 {
308 // will need to update for avx512
309 assert(KNOB_SIMD_WIDTH == 8);
310 simdscalari vec = _simd_set1_epi32(coverageMask[0]);
311 const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
312 vec = _simd_and_si(vec, bit);
313 vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
314 vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
315 inputCoverage = _simd_castsi_ps(vec);
316 }
317
318 INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
319 {
320 uint32_t simdCoverage = (coverageMask[0] & MASK);
321 static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
322 for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
323 {
324 // set all samples to covered if conservative coverage mask is set for that pixel
325 inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
326 }
327 }
328 };
329
330 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
331 // Centroid behaves exactly as follows :
332 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
333 // have a sample location there).
334 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
335 // coverage with the SampleMask Rasterizer State.
336 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
337 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
338 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
339 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
340 template<typename T>
341 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
342 const uint64_t *const coverageMask, const uint32_t sampleMask,
343 simdscalar const &vXSamplePosUL, simdscalar const &vYSamplePosUL)
344 {
345 uint32_t inputMask[KNOB_SIMD_WIDTH];
346 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
347
348 // Case (2) - partially covered pixel
349
350 // scan for first covered sample per pixel in the 4x2 span
351 unsigned long sampleNum[KNOB_SIMD_WIDTH];
352 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
353 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
354 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
355 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
356 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
357 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
358 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
359 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
360
361 // look up and set the sample offsets from UL pixel corner for first covered sample
362 simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
363 samplePos.X(sampleNum[6]),
364 samplePos.X(sampleNum[5]),
365 samplePos.X(sampleNum[4]),
366 samplePos.X(sampleNum[3]),
367 samplePos.X(sampleNum[2]),
368 samplePos.X(sampleNum[1]),
369 samplePos.X(sampleNum[0]));
370
371 simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
372 samplePos.Y(sampleNum[6]),
373 samplePos.Y(sampleNum[5]),
374 samplePos.Y(sampleNum[4]),
375 samplePos.Y(sampleNum[3]),
376 samplePos.Y(sampleNum[2]),
377 samplePos.Y(sampleNum[1]),
378 samplePos.Y(sampleNum[0]));
379 // add sample offset to UL pixel corner
380 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
381 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
382
383 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
384 static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
385 simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
386 simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
387
388 static const simdscalari vZero = _simd_setzero_si();
389 const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
390 simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
391 simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
392 simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
393
394 simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
395
396 // set the centroid position based on results from above
397 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
398 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
399
400 // Case (3a) No samples covered and partial sample mask
401 simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
402 // sample mask should never be all 0's for this case, but handle it anyways
403 unsigned long firstCoveredSampleMaskSample = 0;
404 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
405
406 simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
407
408 vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
409 vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
410
411 // blend in case 3a pixel locations
412 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
413 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
414 }
415
416 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
417 const simdscalar &vXSamplePosUL, const simdscalar &vYSamplePosUL)
418 {
419 // evaluate I,J
420 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
421 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
422 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
423 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
424
425 // interpolate 1/w
426 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
427 }
428
429 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float minz, float maxz)
430 {
431 const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
432 const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
433
434 return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
435 }
436
437 template<typename T>
438 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
439 {
440 // RT has to be single sample if we're in forcedMSAA mode
441 if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
442 {
443 return 1;
444 }
445 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
446 else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
447 {
448 return GetNumSamples(blendSampleCount);
449 }
450 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
451 else
452 {
453 return T::MultisampleT::numSamples;
454 }
455 }
456
457 inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
458 {
459 // broadcast scalars
460
461 coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
462 coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
463 coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
464
465 coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
466 coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
467 coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
468
469 coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
470 coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
471 coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
472
473 coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
474
475 coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
476 coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
477 coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
478 }
479
480 inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers)
481 {
482
483 DWORD index;
484 while (_BitScanForward(&index, colorHotTileMask))
485 {
486 assert(index < SWR_NUM_RENDERTARGETS);
487 colorHotTileMask &= ~(1 << index);
488 pColorBuffer[index] = renderBuffers.pColor[index];
489 }
490
491 if (pDepthBuffer)
492 {
493 *pDepthBuffer = renderBuffers.pDepth;
494 }
495
496 if (pStencilBuffer)
497 {
498 *pStencilBuffer = renderBuffers.pStencil;;
499 }
500 }
501
502 template<typename T>
503 void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
504 {
505 psContext->pAttribs = work.pAttribs;
506 psContext->pPerspAttribs = work.pPerspAttribs;
507 psContext->frontFace = work.triFlags.frontFacing;
508 psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
509
510 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
511 psContext->I = work.I;
512 psContext->J = work.J;
513
514 psContext->recipDet = work.recipDet;
515 psContext->pRecipW = work.pRecipW;
516 psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
517 psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
518 psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
519 psContext->sampleIndex = 0;
520 }
521
522 template<typename T, bool IsSingleSample>
523 void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
524 const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
525 {
526 if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
527 {
528 // for 1x case, centroid is pixel center
529 psContext->vX.centroid = psContext->vX.center;
530 psContext->vY.centroid = psContext->vY.center;
531 psContext->vI.centroid = psContext->vI.center;
532 psContext->vJ.centroid = psContext->vJ.center;
533 psContext->vOneOverW.centroid = psContext->vOneOverW.center;
534 }
535 else
536 {
537 if (T::bCentroidPos)
538 {
539 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
540 if (T::bIsCenterPattern)
541 {
542 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
543 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
544 }
545 else
546 {
547 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
548 CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
549 }
550
551 CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
552 }
553 else
554 {
555 psContext->vX.centroid = psContext->vX.sample;
556 psContext->vY.centroid = psContext->vY.sample;
557 }
558 }
559 }
560
561 template<typename T>
562 struct PixelRateZTestLoop
563 {
564 PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
565 uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
566 pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
567 samplePos(state.rastState.samplePositions),
568 clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
569
570 INLINE
571 uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
572 const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
573 {
574
575 uint32_t statCount = 0;
576 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
577 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
578 {
579 const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
580 vCoverageMask[sample] = _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
581
582 if(!_simd_movemask_ps(vCoverageMask[sample]))
583 {
584 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
585 continue;
586 }
587
588 // offset depth/stencil buffers current sample
589 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
590 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
591
592 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
593 {
594 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
595
596 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
597
598 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
599 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
600
601 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
602 }
603
604 RDTSC_BEGIN(BEBarycentric, pDC->drawId);
605
606 // calculate per sample positions
607 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
608 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
609
610 // calc I & J per sample
611 CalcSampleBarycentrics(coeffs, psContext);
612
613 if(psState.writesODepth)
614 {
615 {
616 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
617 vZ[sample] = psContext.vZ;
618 }
619 }
620 else
621 {
622 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
623 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
624 }
625
626 RDTSC_END(BEBarycentric, 0);
627
628 ///@todo: perspective correct vs non-perspective correct clipping?
629 // if clip distances are enabled, we need to interpolate for each sample
630 if(clipDistanceMask)
631 {
632 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
633
634 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
635 }
636
637 // ZTest for this sample
638 ///@todo Need to uncomment out this bucket.
639 //RDTSC_BEGIN(BEDepthBucket, pDC->drawId);
640 depthPassMask[sample] = vCoverageMask[sample];
641 stencilPassMask[sample] = vCoverageMask[sample];
642 depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
643 vZ[sample], pDepthSample, vCoverageMask[sample],
644 pStencilSample, &stencilPassMask[sample]);
645 //RDTSC_END(BEDepthBucket, 0);
646
647 // early-exit if no pixels passed depth or earlyZ is forced on
648 if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
649 {
650 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
651 pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
652
653 if(!_simd_movemask_ps(depthPassMask[sample]))
654 {
655 continue;
656 }
657 }
658 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
659 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
660 statCount += _mm_popcnt_u32(statMask);
661 }
662
663 activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
664 // return number of samples that passed depth and coverage
665 return statCount;
666 }
667
668 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
669 simdscalar vZ[T::MultisampleT::numCoverageSamples];
670 simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
671 simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
672 simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
673
674 private:
675 // functor inputs
676 DRAW_CONTEXT* pDC;
677 uint32_t workerId;
678
679 const SWR_TRIANGLE_DESC& work;
680 const BarycentricCoeffs& coeffs;
681 const API_STATE& state;
682 const SWR_PS_STATE& psState;
683 const SWR_MULTISAMPLE_POS& samplePos;
684 const uint8_t clipDistanceMask;
685 uint8_t*& pDepthBuffer;
686 uint8_t*& pStencilBuffer;
687 };
688
689 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
690 {
691 // evaluate I,J
692 psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
693 psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
694 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
695 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
696
697 // interpolate 1/w
698 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
699 }
700
701 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
702 {
703 // evaluate I,J
704 psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
705 psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
706 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
707 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
708
709 // interpolate 1/w
710 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
711 }
712
713 // Merge Output to 4x2 SIMD Tile Format
714 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
715 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask)
716 {
717 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
718 const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
719 simdvector blendOut;
720
721 DWORD rt = 0;
722 while (_BitScanForward(&rt, renderTargetMask))
723 {
724 renderTargetMask &= ~(1 << rt);
725 uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
726
727 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
728
729 {
730 // pfnBlendFunc may not update all channels. Initialize with PS output.
731 /// TODO: move this into the blend JIT.
732 blendOut = psContext.shaded[rt];
733
734 // Blend outputs and update coverage mask for alpha test
735 if(pfnBlendFunc[rt] != nullptr)
736 {
737 pfnBlendFunc[rt](
738 pBlendState,
739 psContext.shaded[rt],
740 psContext.shaded[1],
741 psContext.shaded[0].w,
742 sample,
743 pColorSample,
744 blendOut,
745 &psContext.oMask,
746 (simdscalari*)&coverageMask);
747 }
748 }
749
750 // final write mask
751 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
752
753 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
754 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
755
756 const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
757
758 // store with color mask
759 if(!pRTBlend->writeDisableRed)
760 {
761 _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
762 }
763 if(!pRTBlend->writeDisableGreen)
764 {
765 _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
766 }
767 if(!pRTBlend->writeDisableBlue)
768 {
769 _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
770 }
771 if(!pRTBlend->writeDisableAlpha)
772 {
773 _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
774 }
775 }
776 }
777
778 #if USE_8x2_TILE_BACKEND
779 // Merge Output to 8x2 SIMD16 Tile Format
780 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
781 const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
782 {
783 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
784 uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
785
786 if (useAlternateOffset)
787 {
788 rasterTileColorOffset += sizeof(simdscalar);
789 }
790
791 simdvector blendSrc;
792 simdvector blendOut;
793
794 DWORD rt;
795 while (_BitScanForward(&rt, renderTargetMask))
796 {
797 renderTargetMask &= ~(1 << rt);
798
799 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
800
801 simdscalar* pColorSample;
802 bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
803 if (hotTileEnable)
804 {
805 pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
806 blendSrc[0] = pColorSample[0];
807 blendSrc[1] = pColorSample[2];
808 blendSrc[2] = pColorSample[4];
809 blendSrc[3] = pColorSample[6];
810 }
811 else
812 {
813 pColorSample = nullptr;
814 }
815
816 {
817 // pfnBlendFunc may not update all channels. Initialize with PS output.
818 /// TODO: move this into the blend JIT.
819 blendOut = psContext.shaded[rt];
820
821 // Blend outputs and update coverage mask for alpha test
822 if(pfnBlendFunc[rt] != nullptr)
823 {
824 pfnBlendFunc[rt](
825 pBlendState,
826 psContext.shaded[rt],
827 psContext.shaded[1],
828 psContext.shaded[0].w,
829 sample,
830 reinterpret_cast<uint8_t *>(&blendSrc),
831 blendOut,
832 &psContext.oMask,
833 reinterpret_cast<simdscalari *>(&coverageMask));
834 }
835 }
836
837 // final write mask
838 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
839
840 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
841 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
842
843 // store with color mask
844 if (!pRTBlend->writeDisableRed)
845 {
846 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
847 }
848 if (!pRTBlend->writeDisableGreen)
849 {
850 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
851 }
852 if (!pRTBlend->writeDisableBlue)
853 {
854 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
855 }
856 if (!pRTBlend->writeDisableAlpha)
857 {
858 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
859 }
860 }
861 }
862
863 #endif
864
865 template<typename T>
866 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
867 {
868 ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
869
870
871 RDTSC_BEGIN(BEPixelRateBackend, pDC->drawId);
872 RDTSC_BEGIN(BESetup, pDC->drawId);
873
874 const API_STATE &state = GetApiState(pDC);
875
876 BarycentricCoeffs coeffs;
877 SetupBarycentricCoeffs(&coeffs, work);
878
879 SWR_PS_CONTEXT psContext;
880 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
881 SetupPixelShaderContext<T>(&psContext, samplePos, work);
882
883 uint8_t *pDepthBuffer, *pStencilBuffer;
884 SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
885
886 RDTSC_END(BESetup, 0);
887
888 PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask);
889
890 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
891 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
892
893 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
894
895 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
896 {
897 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
898 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
899
900 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
901
902 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
903 {
904 #if USE_8x2_TILE_BACKEND
905 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
906 #endif
907 simdscalar activeLanes;
908 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
909 activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);
910
911 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
912 {
913 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
914
915 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
916 }
917
918 RDTSC_BEGIN(BEBarycentric, pDC->drawId);
919
920 CalcPixelBarycentrics(coeffs, psContext);
921
922 CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
923
924 RDTSC_END(BEBarycentric, 0);
925
926 if(T::bForcedSampleCount)
927 {
928 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
929 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
930 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
931 }
932
933 // Early-Z?
934 if(T::bCanEarlyZ && !T::bForcedSampleCount)
935 {
936 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
937 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
938 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
939 }
940
941 // if we have no covered samples that passed depth at this point, go to next tile
942 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
943
944 if(state.psState.usesSourceDepth)
945 {
946 RDTSC_BEGIN(BEBarycentric, pDC->drawId);
947 // interpolate and quantize z
948 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
949 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
950 RDTSC_END(BEBarycentric, 0);
951 }
952
953 // pixels that are currently active
954 psContext.activeMask = _simd_castps_si(activeLanes);
955 psContext.oMask = T::MultisampleT::FullSampleMask();
956
957 // execute pixel shader
958 RDTSC_BEGIN(BEPixelShader, pDC->drawId);
959 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
960 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
961 RDTSC_END(BEPixelShader, 0);
962
963 // update active lanes to remove any discarded or oMask'd pixels
964 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
965 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
966
967 // late-Z
968 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
969 {
970 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
971 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
972 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
973 }
974
975 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
976 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
977
978 // output merger
979 // loop over all samples, broadcasting the results of the PS to all passing pixels
980 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
981 {
982 RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
983 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
984 uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
985 simdscalar coverageMask, depthMask;
986 if(T::bForcedSampleCount)
987 {
988 coverageMask = depthMask = activeLanes;
989 }
990 else
991 {
992 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
993 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
994 if(!_simd_movemask_ps(depthMask))
995 {
996 // stencil should already have been written in early/lateZ tests
997 RDTSC_END(BEOutputMerger, 0);
998 continue;
999 }
1000 }
1001
1002 // broadcast the results of the PS to all passing pixels
1003 #if USE_8x2_TILE_BACKEND
1004 OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset);
1005 #else // USE_8x2_TILE_BACKEND
1006 OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask);
1007 #endif // USE_8x2_TILE_BACKEND
1008
1009 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
1010 {
1011 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
1012 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1013
1014 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1015 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1016 }
1017 RDTSC_END(BEOutputMerger, 0);
1018 }
1019 Endtile:
1020 RDTSC_BEGIN(BEEndTile, pDC->drawId);
1021
1022 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1023 {
1024 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1025 }
1026
1027 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1028 {
1029 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1030 }
1031 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1032
1033 #if USE_8x2_TILE_BACKEND
1034 if (useAlternateOffset)
1035 {
1036 DWORD rt;
1037 uint32_t rtMask = state.colorHottileEnable;
1038 while (_BitScanForward(&rt, rtMask))
1039 {
1040 rtMask &= ~(1 << rt);
1041 psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1042 }
1043 }
1044 #else
1045 DWORD rt;
1046 uint32_t rtMask = state.colorHottileEnable;
1047 while (_BitScanForward(&rt, rtMask))
1048 {
1049 rtMask &= ~(1 << rt);
1050 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1051 }
1052 #endif
1053 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1054 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1055
1056 RDTSC_END(BEEndTile, 0);
1057
1058 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
1059 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
1060 }
1061
1062 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
1063 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
1064 }
1065
1066 RDTSC_END(BEPixelRateBackend, 0);
1067 }
1068
1069 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
1070 uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
1071 >
1072 struct SwrBackendTraits
1073 {
1074 static const bool bIsCenterPattern = (isCenter == 1);
1075 static const uint32_t InputCoverage = coverage;
1076 static const bool bCentroidPos = (centroid == 1);
1077 static const bool bForcedSampleCount = (forced == 1);
1078 static const bool bCanEarlyZ = (canEarlyZ == 1);
1079 typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
1080 };