swr/rast: Properly size GS stage scratch space
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.h
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29 #pragma once
30
31 #include "common/os.h"
32 #include "core/context.h"
33 #include "core/multisample.h"
34 #include "depthstencil.h"
35 #include "rdtsc_core.h"
36
37 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace);
38 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
39 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
40 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
41 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
42 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
43 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
44 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
45 void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
46
47 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
48 [2] // isCenterPattern
49 [SWR_INPUT_COVERAGE_COUNT]
50 [2] // centroid
51 [2] // forcedSampleCount
52 [2] // canEarlyZ
53 ;
54
// Identifies the kind of backend function; SWR_BACKEND_FUNCS_MAX is the number of kinds.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE    = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
    SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
    SWR_BACKEND_FUNCS_MAX        = 3,
};
62
#if KNOB_SIMD_WIDTH == 8
// Low 8 bits of a coverage word; applied with `& MASK` wherever this header pulls
// the coverage bits for one 8-wide SIMD span out of a larger mask.
#define MASK 0xff

// Per-lane pixel offset vectors (defined elsewhere). Presumably the offsets of each
// lane's pixel center / upper-left corner within the SIMD tile -- confirm at the definition.
extern const simdscalar vCenterOffsetsX;
extern const simdscalar vCenterOffsetsY;
extern const simdscalar vULOffsetsX;
extern const simdscalar vULOffsetsY;
#endif // KNOB_SIMD_WIDTH == 8
70
71 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
72 {
73 static const uint32_t RasterTileColorOffsets[16]
74 { 0,
75 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
76 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
77 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
78 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
79 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
80 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
81 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
82 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
83 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
84 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
85 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
86 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
87 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
88 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
89 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
90 };
91 assert(sampleNum < 16);
92 return RasterTileColorOffsets[sampleNum];
93 }
94
95 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
96 {
97 static const uint32_t RasterTileDepthOffsets[16]
98 { 0,
99 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
100 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
101 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
102 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
103 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
104 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
105 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
106 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
107 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
108 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
109 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
110 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
111 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
112 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
113 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
114 };
115 assert(sampleNum < 16);
116 return RasterTileDepthOffsets[sampleNum];
117 }
118
119 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
120 {
121 static const uint32_t RasterTileStencilOffsets[16]
122 { 0,
123 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
124 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
125 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
126 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
127 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
128 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
129 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
130 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
131 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
132 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
133 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
134 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
135 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
136 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
137 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
138 };
139 assert(sampleNum < 16);
140 return RasterTileStencilOffsets[sampleNum];
141 }
142
// Functor-style helper: its constructors compute input coverage for one 8-wide SIMD span
// (the existing comments below call it a 4x2 pixel block). This is the primary template; the
// InputCoverage parameter is not read in the body and exists so that specific modes can be
// specialized separately.
143 template<typename T, uint32_t InputCoverage>
144 struct generateInputCoverage
145 {
// Fills inputMask[i] with the sample coverage of pixel i: bit s of inputMask[i] is set when
// sample s covers pixel i. The source data is coverageMask[s], whose low 8 bits hold one bit per
// pixel for sample s. Unless T::bForcedSampleCount is set, every result is ANDed with sampleMask.
//
// coverageMask : array of per-sample coverage words (at least T::MultisampleT::numSamples entries
//                are read on the non-center path; only entry 0 on the center-pattern path)
// inputMask    : output, one 32-bit sample mask per pixel
// sampleMask   : sample mask applied to the result when MSAA is not forced
146 INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
147 {
148 // will need to update for avx512
149 assert(KNOB_SIMD_WIDTH == 8);
150
// mask[] holds the gather lane-enable masks (non-center path) and is later reused for the byte
// shuffle control; sampleCoverage[0]/[1] hold the coverage words of samples 0-7 / 8-15, one per lane.
// Index [1] of both arrays is only written and read for 16 samples.
151 simdscalari mask[2];
152 simdscalari sampleCoverage[2];
153
154 if(T::bIsCenterPattern)
155 {
156 // center coverage is the same for all samples; just broadcast to the sample slots
157 uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
// _mm256_set_epi32 lists lanes from highest to lowest, so the trailing arguments are lanes 0..n-1
158 if(T::MultisampleT::numSamples == 1)
159 {
160 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
161 }
162 else if(T::MultisampleT::numSamples == 2)
163 {
164 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
165 }
166 else if(T::MultisampleT::numSamples == 4)
167 {
168 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
169 }
170 else if(T::MultisampleT::numSamples == 8)
171 {
172 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
173 }
174 else if(T::MultisampleT::numSamples == 16)
175 {
176 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
177 sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
178 }
179 }
180 else
181 {
// src supplies the value (zero) for lanes whose gather mask is off; index0/index1 select
// coverageMask entries 0-7 and 8-15. index1 is only initialized (and only used) for 16 samples.
182 __m256i src = _mm256_set1_epi32(0);
183 __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
184
// enable one gather lane per sample that exists at this sample count
185 if(T::MultisampleT::numSamples == 1)
186 {
187 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
188 }
189 else if(T::MultisampleT::numSamples == 2)
190 {
191 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
192 }
193 else if(T::MultisampleT::numSamples == 4)
194 {
195 mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
196 }
197 else if(T::MultisampleT::numSamples == 8)
198 {
199 mask[0] = _mm256_set1_epi32(-1);
200 }
201 else if(T::MultisampleT::numSamples == 16)
202 {
203 mask[0] = _mm256_set1_epi32(-1);
204 mask[1] = _mm256_set1_epi32(-1);
205 index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
206 }
207
// The gather uses a scale of 8 bytes with a 32-bit element type, so lane k receives the low
// 32 bits of the 64-bit entry coverageMask[index[k]].
// NOTE(review): assumes _simd_mask_i32gather_ps has masked-gather semantics (masked-off lanes
// keep src) -- confirm against the wrapper's definition.
208 // gather coverage for samples 0-7
209 sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
210 if(T::MultisampleT::numSamples > 8)
211 {
212 // gather coverage for samples 8-15
213 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
214 }
215 }
216
// mask[0] is reused from here on as a byte-shuffle control: within each 128-bit lane it selects
// bytes 0x0, 0x4, 0x8 and 0xC (the low byte of each 32-bit element) into the lowest four bytes;
// the -1 entries are intended to zero the remaining bytes.
217 mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
218 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
219 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
220 simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
221
222 simdscalari packedCoverage1;
223 if(T::MultisampleT::numSamples > 8)
224 {
225 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
226 packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
227 }
228
// Two implementations of the same packing step follow. The KNOB_ARCH_AVX path uses only
// float-domain permutes/shuffles/blends; the other path uses the integer permutevar8x32 / blend_epi32
// intrinsics. Either way the goal is one coverage byte per sample, laid out contiguously so that
// byte s belongs to sample s.
229 #if (KNOB_ARCH == KNOB_ARCH_AVX)
230 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
231 simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
232 simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
233 packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
234
235 simdscalari packedSampleCoverage;
236 if(T::MultisampleT::numSamples > 8)
237 {
238 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
239 hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
240 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
241 shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
242 packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
243 packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
244 }
245 else
246 {
247 packedSampleCoverage = packedCoverage0;
248 }
249 #else
// permMask moves 32-bit elements 0 and 4 (the packed bytes of each 128-bit lane) into elements
// 0 and 1; all other destinations read element 7, which the byte shuffle above left as zero.
250 simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
251 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
252 packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
253
254 simdscalari packedSampleCoverage;
255 if(T::MultisampleT::numSamples > 8)
256 {
257 permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
258 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
259 packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
260
261 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
262 packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
263 }
264 else
265 {
266 packedSampleCoverage = packedCoverage0;
267 }
268 #endif
269
// The loop runs from pixel 7 down to pixel 0: the byte movemask collects the most significant
// bit of every byte (bit 7 = pixel 7 of each sample's coverage byte), and the left shift by one
// then moves the next lower pixel's bit into the MSB position for the following iteration.
// NOTE(review): assumes _simd_movemask_epi8 / _simd_slli_epi32 behave like the corresponding
// _mm256 intrinsics -- confirm against the wrapper definitions.
270 for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
271 {
272 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
273 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
274
275 if(!T::bForcedSampleCount)
276 {
277 // input coverage has to be anded with sample mask if MSAA isn't forced on
278 inputMask[i] &= sampleMask;
279 }
280
281 // shift to the next pixel in the 4x2
282 packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
283 }
284 }
285
// SIMD-register overload: computes the per-pixel masks into a temporary array by constructing a
// generateInputCoverage<T, T::InputCoverage> temporary with the array overload, then loads the
// eight masks into inputCoverage with inputMask[k] in lane k (bit pattern only; no int->float
// conversion is performed).
286 INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
287 {
288 uint32_t inputMask[KNOB_SIMD_WIDTH];
289 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
290 inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
291 }
292
293 };
294
295 template<typename T>
296 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
297 {
298 INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
299 {
300 // will need to update for avx512
301 assert(KNOB_SIMD_WIDTH == 8);
302 simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
303 const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
304 vec = _simd_and_si(vec, bit);
305 vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
306 vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
307 inputCoverage = _simd_castsi_ps(vec);
308 }
309
310 INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
311 {
312 uint32_t simdCoverage = (coverageMask[0] & MASK);
313 static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
314 for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
315 {
316 // set all samples to covered if conservative coverage mask is set for that pixel
317 inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
318 }
319 }
320 };
321
322 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
323 // Centroid behaves exactly as follows :
324 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
325 // have a sample location there).
326 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
327 // coverage with the SampleMask Rasterizer State.
328 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
329 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
330 // SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
331 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
332 template<typename T>
333 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
334 const uint64_t *const coverageMask, const uint32_t sampleMask,
335 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
336 {
337 uint32_t inputMask[KNOB_SIMD_WIDTH];
338 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
339
340 // Case (2) - partially covered pixel
341
342 // scan for first covered sample per pixel in the 4x2 span
343 unsigned long sampleNum[KNOB_SIMD_WIDTH];
344 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
345 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
346 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
347 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
348 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
349 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
350 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
351 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
352
353 // look up and set the sample offsets from UL pixel corner for first covered sample
354 __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
355 samplePos.X(sampleNum[6]),
356 samplePos.X(sampleNum[5]),
357 samplePos.X(sampleNum[4]),
358 samplePos.X(sampleNum[3]),
359 samplePos.X(sampleNum[2]),
360 samplePos.X(sampleNum[1]),
361 samplePos.X(sampleNum[0]));
362
363 __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
364 samplePos.Y(sampleNum[6]),
365 samplePos.Y(sampleNum[5]),
366 samplePos.Y(sampleNum[4]),
367 samplePos.Y(sampleNum[3]),
368 samplePos.Y(sampleNum[2]),
369 samplePos.Y(sampleNum[1]),
370 samplePos.Y(sampleNum[0]));
371 // add sample offset to UL pixel corner
372 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
373 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
374
375 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
376 static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
377 simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
378 simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
379
380 static const simdscalari vZero = _simd_setzero_si();
381 const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
382 simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
383 simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
384 simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
385
386 simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
387
388 // set the centroid position based on results from above
389 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
390 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
391
392 // Case (3a) No samples covered and partial sample mask
393 simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
394 // sample mask should never be all 0's for this case, but handle it anyways
395 unsigned long firstCoveredSampleMaskSample = 0;
396 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
397
398 simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
399
400 vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
401 vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
402
403 // blend in case 3a pixel locations
404 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
405 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
406 }
407
408 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
409 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
410 {
411 // evaluate I,J
412 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
413 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
414 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
415 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
416
417 // interpolate 1/w
418 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
419 }
420
421 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
422 {
423 const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
424 const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
425
426 return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
427 }
428
429 template<typename T>
430 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
431 {
432 // RT has to be single sample if we're in forcedMSAA mode
433 if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
434 {
435 return 1;
436 }
437 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
438 else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
439 {
440 return GetNumSamples(blendSampleCount);
441 }
442 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
443 else
444 {
445 return T::MultisampleT::numSamples;
446 }
447 }
448
449 inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
450 {
451 // broadcast scalars
452
453 coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
454 coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
455 coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
456
457 coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
458 coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
459 coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
460
461 coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
462 coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
463 coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
464
465 coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
466
467 coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
468 coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
469 coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
470 }
471
472 inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
473 {
474 assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
475
476 if (pColorBuffer)
477 {
478 for (uint32_t index = 0; index < colorBufferCount; index += 1)
479 {
480 pColorBuffer[index] = renderBuffers.pColor[index];
481 }
482 }
483
484 if (pDepthBuffer)
485 {
486 *pDepthBuffer = renderBuffers.pDepth;
487 }
488
489 if (pStencilBuffer)
490 {
491 *pStencilBuffer = renderBuffers.pStencil;;
492 }
493 }
494
495 template<typename T>
496 void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
497 {
498 psContext->pAttribs = work.pAttribs;
499 psContext->pPerspAttribs = work.pPerspAttribs;
500 psContext->frontFace = work.triFlags.frontFacing;
501 psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
502
503 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
504 psContext->I = work.I;
505 psContext->J = work.J;
506
507 psContext->recipDet = work.recipDet;
508 psContext->pRecipW = work.pRecipW;
509 psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
510 psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
511 psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
512 psContext->sampleIndex = 0;
513 }
514
515 template<typename T, bool IsSingleSample>
516 void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
517 const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
518 {
519 if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
520 {
521 // for 1x case, centroid is pixel center
522 psContext->vX.centroid = psContext->vX.center;
523 psContext->vY.centroid = psContext->vY.center;
524 psContext->vI.centroid = psContext->vI.center;
525 psContext->vJ.centroid = psContext->vJ.center;
526 psContext->vOneOverW.centroid = psContext->vOneOverW.center;
527 }
528 else
529 {
530 if (T::bCentroidPos)
531 {
532 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
533 if (T::bIsCenterPattern)
534 {
535 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
536 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
537 }
538 else
539 {
540 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
541 CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
542 }
543
544 CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
545 }
546 else
547 {
548 psContext->vX.centroid = psContext->vX.sample;
549 psContext->vY.centroid = psContext->vY.sample;
550 }
551 }
552 }
553
// Functor that runs the depth/stencil test for every coverage sample of one SIMD span when the
// backend runs at pixel rate. The per-sample results (Z, coverage, depth-pass and stencil-pass
// masks) are kept in the public arrays below so the caller can use them afterwards (the comment
// on those members names OM and DepthWrite as the consumers).
554 template<typename T>
555 struct PixelRateZTestLoop
556 {
// Captures the draw context, worker id, triangle descriptor, barycentric coefficients and API
// state. Everything except DC, _workerId and ClipDistanceMask is stored by reference (including
// the depth/stencil buffer pointers), so the referenced objects must outlive this functor and
// later changes to the caller's buffer pointers are visible here.
557 PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
558 uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
559 pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
560 samplePos(state.rastState.samplePositions),
561 clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
562
// Runs the per-sample loop.
//
// activeLanes      : in/out lane mask; on return it is ANDed with the lanes that passed the depth
//                    test for at least one sample
// psContext        : pixel shader context; the per-sample position/barycentric fields are
//                    overwritten for each sample, and vZ is read when the PS writes oDepth
// BEDepthBucket    : not used at present (the bucket calls that would use it are commented out)
// currentSimdIn8x8 : selects which byte of each per-sample coverage word belongs to this SIMD span
//
// Returns the total number of lanes that passed the depth test, summed over all samples.
563 INLINE
564 uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
565 const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
566 {
// NOTE(review): pContext is not referenced directly below; presumably the AR_BEGIN/AR_END
// macros expand to code that uses it (and workerId) -- confirm before removing or renaming.
567 SWR_CONTEXT *pContext = pDC->pContext;
568
569 uint32_t statCount = 0;
570 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
571 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
572 {
// the coverage word of this sample is viewed as bytes; one byte covers one SIMD span
573 const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
574 vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
575
// nothing covered for this sample: clear its saved masks and move on (vZ[sample] is left untouched)
576 if(!_simd_movemask_ps(vCoverageMask[sample]))
577 {
578 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
579 continue;
580 }
581
582 // offset depth/stencil buffers current sample
583 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
584 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
585
// depth bounds test: lanes whose stored depth value lies outside [min, max] lose coverage
586 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
587 {
588 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
589
590 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
591
592 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
593 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
594
595 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
596 }
597
598 AR_BEGIN(BEBarycentric, pDC->drawId);
599
600 // calculate per sample positions
601 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
602 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
603
604 // calc I & J per sample
605 CalcSampleBarycentrics(coeffs, psContext);
606
// Z for this sample is either the depth written by the pixel shader or the interpolated,
// quantized plane value
607 if(psState.writesODepth)
608 {
609 {
610 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
611 vZ[sample] = psContext.vZ;
612 }
613 }
614 else
615 {
616 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
617 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
618 }
619
620 AR_END(BEBarycentric, 0);
621
622 ///@todo: perspective correct vs non-perspective correct clipping?
623 // if clip distances are enabled, we need to interpolate for each sample
624 if(clipDistanceMask)
625 {
626 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
627
// lanes flagged in clipMask are removed from this sample's coverage
628 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
629 }
630
631 // ZTest for this sample
632 ///@todo Need to uncomment out this bucket.
633 //AR_BEGIN(BEDepthBucket, pDC->drawId);
// both masks start out as the coverage mask; depthPassMask is then replaced by the test result,
// while stencilPassMask is passed by pointer to DepthStencilTest
634 depthPassMask[sample] = vCoverageMask[sample];
635 stencilPassMask[sample] = vCoverageMask[sample];
636 depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
637 vZ[sample], pDepthSample, vCoverageMask[sample],
638 pStencilSample, &stencilPassMask[sample]);
639 //AR_END(BEDepthBucket, 0);
640
// DepthStencilWrite is issued right here when earlyZ is forced or when no lane passed;
// in the latter case the sample contributes nothing further and the loop continues
641 // early-exit if no pixels passed depth or earlyZ is forced on
642 if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
643 {
644 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
645 pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
646
647 if(!_simd_movemask_ps(depthPassMask[sample]))
648 {
649 continue;
650 }
651 }
// accumulate the lanes that passed for any sample and count the passing lanes of this sample
652 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
653 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
654 statCount += _mm_popcnt_u32(statMask);
655 }
656
657 activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
658 // return number of samples that passed depth and coverage
659 return statCount;
660 }
661
662 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
663 simdscalar vZ[T::MultisampleT::numCoverageSamples];
664 simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
665 simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
666 simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
667
668 private:
669 // functor inputs
670 DRAW_CONTEXT* pDC;
671 uint32_t workerId;
672
// samplePos is initialized from state.rastState.samplePositions, so it relies on `state` being
// declared (and therefore initialized) before it
673 const SWR_TRIANGLE_DESC& work;
674 const BarycentricCoeffs& coeffs;
675 const API_STATE& state;
676 const SWR_PS_STATE& psState;
677 const SWR_MULTISAMPLE_POS& samplePos;
678 const uint8_t clipDistanceMask;
679 uint8_t*& pDepthBuffer;
680 uint8_t*& pStencilBuffer;
681 };
682
683 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
684 {
685 // evaluate I,J
686 psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
687 psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
688 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
689 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
690
691 // interpolate 1/w
692 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
693 }
694
695 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
696 {
697 // evaluate I,J
698 psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
699 psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
700 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
701 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
702
703 // interpolate 1/w
704 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
705 }
706
707 // Merge Output to 4x2 SIMD Tile Format
708 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
709 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
710 {
711 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
712 const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
713 simdvector blendOut;
714
715 for(uint32_t rt = 0; rt < NumRT; ++rt)
716 {
717 uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
718
719 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
720
721 {
722 // pfnBlendFunc may not update all channels. Initialize with PS output.
723 /// TODO: move this into the blend JIT.
724 blendOut = psContext.shaded[rt];
725
726 // Blend outputs and update coverage mask for alpha test
727 if(pfnBlendFunc[rt] != nullptr)
728 {
729 pfnBlendFunc[rt](
730 pBlendState,
731 psContext.shaded[rt],
732 psContext.shaded[1],
733 psContext.shaded[0].w,
734 sample,
735 pColorSample,
736 blendOut,
737 &psContext.oMask,
738 (simdscalari*)&coverageMask);
739 }
740 }
741
742 // final write mask
743 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
744
745 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
746 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
747
748 const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
749
750 // store with color mask
751 if(!pRTBlend->writeDisableRed)
752 {
753 _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
754 }
755 if(!pRTBlend->writeDisableGreen)
756 {
757 _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
758 }
759 if(!pRTBlend->writeDisableBlue)
760 {
761 _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
762 }
763 if(!pRTBlend->writeDisableAlpha)
764 {
765 _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
766 }
767 }
768 }
769
770 #if USE_8x2_TILE_BACKEND
771 // Merge Output to 8x2 SIMD16 Tile Format
772 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
773 const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
774 {
775 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
776 uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
777
778 if (useAlternateOffset)
779 {
780 rasterTileColorOffset += sizeof(simdscalar);
781 }
782
783 simdvector blendSrc;
784 simdvector blendOut;
785
786 uint32_t colorBufferBit = 1;
787 for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
788 {
789 simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
790
791 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
792
793 if (colorBufferBit & colorBufferEnableMask)
794 {
795 blendSrc[0] = pColorSample[0];
796 blendSrc[1] = pColorSample[2];
797 blendSrc[2] = pColorSample[4];
798 blendSrc[3] = pColorSample[6];
799 }
800
801 {
802 // pfnBlendFunc may not update all channels. Initialize with PS output.
803 /// TODO: move this into the blend JIT.
804 blendOut = psContext.shaded[rt];
805
806 // Blend outputs and update coverage mask for alpha test
807 if(pfnBlendFunc[rt] != nullptr)
808 {
809 pfnBlendFunc[rt](
810 pBlendState,
811 psContext.shaded[rt],
812 psContext.shaded[1],
813 psContext.shaded[0].w,
814 sample,
815 reinterpret_cast<uint8_t *>(&blendSrc),
816 blendOut,
817 &psContext.oMask,
818 reinterpret_cast<simdscalari *>(&coverageMask));
819 }
820 }
821
822 // final write mask
823 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
824
825 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
826 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
827
828 // store with color mask
829 if (!pRTBlend->writeDisableRed)
830 {
831 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
832 }
833 if (!pRTBlend->writeDisableGreen)
834 {
835 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
836 }
837 if (!pRTBlend->writeDisableBlue)
838 {
839 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
840 }
841 if (!pRTBlend->writeDisableAlpha)
842 {
843 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
844 }
845 }
846 }
847
848 #endif
849
// Pixel-rate backend: the pixel shader is invoked once per SIMD tile and its
// result is written to every sample that passes coverage and depth.
//
// The function walks the tile starting at (x, y) in steps of
// SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM. For each SIMD tile it:
//   1. skips straight to Endtile when no sample is covered,
//   2. optionally generates the input coverage mask,
//   3. computes pixel barycentrics and centroid,
//   4. runs the depth/stencil test before the shader (T::bCanEarlyZ) or
//      after it (!T::bCanEarlyZ), unless T::bForcedSampleCount is set,
//   5. runs the pixel shader,
//   6. runs the output merger and depth/stencil write for each sample,
//   7. at Endtile, shifts the coverage masks and advances the buffer pointers.
//
// T             - backend traits; this function reads T::InputCoverage,
//                 T::bForcedSampleCount, T::bCanEarlyZ, T::bIsCenterPattern
//                 and T::MultisampleT
// pDC           - draw context; source of the API state, private state and drawId
// workerId      - forwarded to the Z-test functor
//                 NOTE(review): presumably also used by the stats/AR macros - confirm
// x, y          - starting coordinates of the tile; converted to float and added
//                 to the per-lane offsets to form the UL/center positions
// work          - triangle descriptor; its coverageMask[], innerCoverageMask and
//                 anyCoveredSamples are shifted in place as the walk proceeds
// renderBuffers - handed to SetupRenderBuffers() to obtain color/depth/stencil pointers
850 template<typename T>
851 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
852 {
853 ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
854
855
// NOTE(review): pContext is never referenced by name below; presumably one of the
// AR_* / UPDATE_STAT_BE macros expands to code that uses it - confirm before removing.
856 SWR_CONTEXT *pContext = pDC->pContext;
857
858 AR_BEGIN(BEPixelRateBackend, pDC->drawId);
859 AR_BEGIN(BESetup, pDC->drawId);
860
861 const API_STATE &state = GetApiState(pDC);
862
863 BarycentricCoeffs coeffs;
864 SetupBarycentricCoeffs(&coeffs, work);
865
866 SWR_PS_CONTEXT psContext;
867 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
868 SetupPixelShaderContext<T>(&psContext, samplePos, work);
869
870 uint8_t *pDepthBuffer, *pStencilBuffer;
871 SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
872
873 AR_END(BESetup, 0);
874
// Depth/stencil test functor; it is invoked below for either the early or the late Z test.
// Its vCoverageMask/depthPassMask/stencilPassMask/vZ members are read back by the output merger loop.
875 PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
876
// Y positions for the first row of SIMD tiles; advanced by dy at the bottom of the outer loop.
877 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
878 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
879
880 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
881
882 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
883 {
// X positions restart at the left edge for every row; advanced by dx at the bottom of the inner loop.
884 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
885 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
886
887 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
888
889 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
890 {
891 #if USE_8x2_TILE_BACKEND
// true when the SIMD_TILE_X_DIM bit of xx is set; selects the alternate offset in OutputMerger8x2
// and gates the color pointer advance at Endtile
892 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
893 #endif
894 simdscalar activeLanes;
// MASK is defined outside this function.
// NOTE(review): presumably it selects the anyCoveredSamples bits belonging to the current SIMD tile - confirm.
895 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
896 activeLanes = vMask(work.anyCoveredSamples & MASK);
897
898 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
899 {
// inner-conservative mode reads the single inner coverage mask; otherwise sample 0's coverage mask is used
900 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
901
902 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
903 }
904
905 AR_BEGIN(BEBarycentric, pDC->drawId);
906
907 CalcPixelBarycentrics(coeffs, psContext);
908
909 CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
910
911 AR_END(BEBarycentric, 0);
912
913 if(T::bForcedSampleCount)
914 {
915 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
// the compare is signed (cmpgt_epi32 against zero) and the same sampleMask value is broadcast to every lane
916 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
917 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
918 }
919
920 // Early-Z?
921 if(T::bCanEarlyZ && !T::bForcedSampleCount)
922 {
// the functor takes activeLanes by argument and its return value is the count recorded as DepthPassCount
923 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
924 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
925 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
926 }
927
928 // if we have no covered samples that passed depth at this point, go to next tile
929 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
930
931 if(state.psState.usesSourceDepth)
932 {
933 AR_BEGIN(BEBarycentric, pDC->drawId);
934 // interpolate and quantize z
935 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
936 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
937 AR_END(BEBarycentric, 0);
938 }
939
940 // pixels that are currently active
941 psContext.activeMask = _simd_castps_si(activeLanes);
942 psContext.oMask = T::MultisampleT::FullSampleMask();
943
944 // execute pixel shader
945 AR_BEGIN(BEPixelShader, pDC->drawId);
946 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
// invocations are counted from activeLanes as it was before the shader ran (it is not updated until below)
947 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
948 AR_END(BEPixelShader, 0);
949
950 // update active lanes to remove any discarded or oMask'd pixels
951 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
952 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
953
954 // late-Z
955 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
956 {
957 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
958 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
959 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
960 }
961
962 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
963 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
964
965 // output merger
966 // loop over all samples, broadcasting the results of the PS to all passing pixels
967 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
968 {
969 AR_BEGIN(BEOutputMerger, pDC->drawId);
970 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
971 uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
972 simdscalar coverageMask, depthMask;
973 if(T::bForcedSampleCount)
974 {
// no Z test ran in this mode, so the active lanes stand in for both masks
975 coverageMask = depthMask = activeLanes;
976 }
977 else
978 {
979 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
980 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
981 if(!_simd_movemask_ps(depthMask))
982 {
983 // stencil should already have been written in early/lateZ tests
// close the BEOutputMerger bucket opened at the top of this iteration before skipping the sample
984 AR_END(BEOutputMerger, 0);
985 continue;
986 }
987 }
988
989 // broadcast the results of the PS to all passing pixels
990 #if USE_8x2_TILE_BACKEND
991 OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
992 #else // USE_8x2_TILE_BACKEND
993 OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
994 #endif // USE_8x2_TILE_BACKEND
995
// depth/stencil is written here, after the merger has run and with the coverageMask it was given
// by reference, unless forceEarlyZ or forced sample count is in effect
996 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
997 {
// buffer offsets are indexed by 'sample', while the saved Z and stencil mask are indexed by coverageSampleNum
998 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
999 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
1000
1001 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1002 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1003 }
1004 AR_END(BEOutputMerger, 0);
1005 }
// Reached by every SIMD tile, including those that bailed out early via goto:
// the per-tile mask shifts and pointer advances below must always happen.
1006 Endtile:
1007 AR_BEGIN(BEEndTile, pDC->drawId);
1008
// drop the bits consumed by this SIMD tile so the next tile's bits move into the low positions
1009 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1010 {
1011 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1012 }
1013
1014 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1015 {
1016 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1017 }
1018 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1019
1020 #if USE_8x2_TILE_BACKEND
// in the 8x2 layout the color pointers advance only on alternate-offset tiles, by two SIMD widths at once
1021 if (useAlternateOffset)
1022 {
1023 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1024 {
1025 psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1026 }
1027 }
1028 #else
1029 for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1030 {
1031 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1032 }
1033 #endif
// depth and stencil pointers advance by one SIMD width of their format on every tile, in both layouts
1034 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1035 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1036
1037 AR_END(BEEndTile, 0);
1038
1039 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
1040 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
1041 }
1042
1043 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
1044 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
1045 }
1046
1047 AR_END(BEPixelRateBackend, 0);
1048 }
1049
1050 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
1051 uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
1052 >
1053 struct SwrBackendTraits
1054 {
1055 static const bool bIsCenterPattern = (isCenter == 1);
1056 static const uint32_t InputCoverage = coverage;
1057 static const bool bCentroidPos = (centroid == 1);
1058 static const bool bForcedSampleCount = (forced == 1);
1059 static const bool bCanEarlyZ = (canEarlyZ == 1);
1060 typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
1061 };