swr: [rasterizer] Backend code adjustments
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.h
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29 #pragma once
30
31 #include "common/os.h"
32 #include "core/context.h"
33 #include "core/multisample.h"
34 #include "depthstencil.h"
35 #include "rdtsc_core.h"
36
37 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
38 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
39 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
40 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
41 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
42 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
43 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
44 void InitClearTilesTable();
45 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
46 void InitBackendFuncTables();
47 void InitCPSFuncTables();
48 void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
49
50 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
51 [SWR_MSAA_SAMPLE_PATTERN_COUNT]
52 [SWR_INPUT_COVERAGE_COUNT]
53 [2] // centroid
54 [2] // forcedSampleCount
55 [2] // canEarlyZ
56 ;
57
// Identifiers for the backend function flavours; SWR_BACKEND_FUNCS_MAX is the count.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE    = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
    SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
    SWR_BACKEND_FUNCS_MAX        = 3,
};
65
#if KNOB_SIMD_WIDTH == 8
// SIMD constants defined in another translation unit
// (NOTE(review): presumably per-lane pixel-center / upper-left offsets for a 4x2 - confirm).
extern const __m256 vCenterOffsetsX;
extern const __m256 vCenterOffsetsY;
extern const __m256 vULOffsetsX;
extern const __m256 vULOffsetsY;

// One bit per SIMD lane for the 8-wide configuration.
#define MASK 0xff
#endif
73
74 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
75 {
76 static const uint32_t RasterTileColorOffsets[16]
77 { 0,
78 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
79 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
80 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
81 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
82 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
83 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
84 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
85 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
86 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
87 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
88 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
89 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
90 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
91 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
92 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
93 };
94 assert(sampleNum < 16);
95 return RasterTileColorOffsets[sampleNum];
96 }
97
98 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
99 {
100 static const uint32_t RasterTileDepthOffsets[16]
101 { 0,
102 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
103 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
104 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
105 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
106 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
107 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
108 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
109 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
110 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
111 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
112 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
113 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
114 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
115 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
116 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
117 };
118 assert(sampleNum < 16);
119 return RasterTileDepthOffsets[sampleNum];
120 }
121
122 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
123 {
124 static const uint32_t RasterTileStencilOffsets[16]
125 { 0,
126 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
127 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
128 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
129 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
130 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
131 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
132 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
133 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
134 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
135 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
136 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
137 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
138 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
139 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
140 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
141 };
142 assert(sampleNum < 16);
143 return RasterTileStencilOffsets[sampleNum];
144 }
145
146 template<typename T, uint32_t InputCoverage>
147 struct generateInputCoverage
148 {
149 INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
150 {
151 // will need to update for avx512
152 assert(KNOB_SIMD_WIDTH == 8);
153
154 __m256i mask[2];
155 __m256i sampleCoverage[2];
156 if(T::bIsStandardPattern)
157 {
158 __m256i src = _mm256_set1_epi32(0);
159 __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
160
161 if(T::MultisampleT::numSamples == 1)
162 {
163 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
164 }
165 else if(T::MultisampleT::numSamples == 2)
166 {
167 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
168 }
169 else if(T::MultisampleT::numSamples == 4)
170 {
171 mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
172 }
173 else if(T::MultisampleT::numSamples == 8)
174 {
175 mask[0] = _mm256_set1_epi32(-1);
176 }
177 else if(T::MultisampleT::numSamples == 16)
178 {
179 mask[0] = _mm256_set1_epi32(-1);
180 mask[1] = _mm256_set1_epi32(-1);
181 index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
182 }
183
184 // gather coverage for samples 0-7
185 sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
186 if(T::MultisampleT::numSamples > 8)
187 {
188 // gather coverage for samples 8-15
189 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
190 }
191 }
192 else
193 {
194 // center coverage is the same for all samples; just broadcast to the sample slots
195 uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
196 if(T::MultisampleT::numSamples == 1)
197 {
198 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
199 }
200 else if(T::MultisampleT::numSamples == 2)
201 {
202 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
203 }
204 else if(T::MultisampleT::numSamples == 4)
205 {
206 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
207 }
208 else if(T::MultisampleT::numSamples == 8)
209 {
210 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
211 }
212 else if(T::MultisampleT::numSamples == 16)
213 {
214 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
215 sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
216 }
217 }
218
219 mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
220 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
221 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
222 __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
223
224 __m256i packedCoverage1;
225 if(T::MultisampleT::numSamples > 8)
226 {
227 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
228 packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
229 }
230
231 #if (KNOB_ARCH == KNOB_ARCH_AVX)
232 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
233 __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
234 __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
235 packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
236
237 __m256i packedSampleCoverage;
238 if(T::MultisampleT::numSamples > 8)
239 {
240 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
241 hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
242 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
243 shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
244 packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
245 packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
246 }
247 else
248 {
249 packedSampleCoverage = packedCoverage0;
250 }
251 #else
252 __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
253 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
254 packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
255
256 __m256i packedSampleCoverage;
257 if(T::MultisampleT::numSamples > 8)
258 {
259 permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
260 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
261 packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
262
263 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
264 packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
265 }
266 else
267 {
268 packedSampleCoverage = packedCoverage0;
269 }
270 #endif
271
272 for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
273 {
274 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
275 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
276
277 if(!T::bForcedSampleCount)
278 {
279 // input coverage has to be anded with sample mask if MSAA isn't forced on
280 inputMask[i] &= sampleMask;
281 }
282
283 // shift to the next pixel in the 4x2
284 packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
285 }
286 }
287
288 INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
289 {
290 uint32_t inputMask[KNOB_SIMD_WIDTH];
291 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
292 inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
293 }
294
295 };
296
297 template<typename T>
298 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
299 {
300 INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
301 {
302 // will need to update for avx512
303 assert(KNOB_SIMD_WIDTH == 8);
304 __m256i vec = _mm256_set1_epi32(coverageMask[0]);
305 const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
306 vec = _simd_and_si(vec, bit);
307 vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
308 vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
309 inputCoverage = _simd_castsi_ps(vec);
310 }
311
312 INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
313 {
314 uint32_t simdCoverage = (coverageMask[0] & MASK);
315 static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
316 for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
317 {
318 // set all samples to covered if conservative coverage mask is set for that pixel
319 inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
320 }
321 }
322 };
323
324 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
325 // Centroid behaves exactly as follows :
326 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
327 // have a sample location there).
328 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
329 // coverage with the SampleMask Rasterizer State.
330 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
331 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
332 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
333 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
334 template<typename T>
335 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
336 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
337 {
338 uint32_t inputMask[KNOB_SIMD_WIDTH];
339 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
340
341 // Case (2) - partially covered pixel
342
343 // scan for first covered sample per pixel in the 4x2 span
344 unsigned long sampleNum[KNOB_SIMD_WIDTH];
345 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
346 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
347 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
348 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
349 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
350 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
351 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
352 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
353
354 // look up and set the sample offsets from UL pixel corner for first covered sample
355 __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
356 T::MultisampleT::X(sampleNum[6]),
357 T::MultisampleT::X(sampleNum[5]),
358 T::MultisampleT::X(sampleNum[4]),
359 T::MultisampleT::X(sampleNum[3]),
360 T::MultisampleT::X(sampleNum[2]),
361 T::MultisampleT::X(sampleNum[1]),
362 T::MultisampleT::X(sampleNum[0]));
363
364 __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
365 T::MultisampleT::Y(sampleNum[6]),
366 T::MultisampleT::Y(sampleNum[5]),
367 T::MultisampleT::Y(sampleNum[4]),
368 T::MultisampleT::Y(sampleNum[3]),
369 T::MultisampleT::Y(sampleNum[2]),
370 T::MultisampleT::Y(sampleNum[1]),
371 T::MultisampleT::Y(sampleNum[0]));
372 // add sample offset to UL pixel corner
373 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
374 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
375
376 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
377 static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
378 __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
379 __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
380
381 static const __m256i vZero = _simd_setzero_si();
382 const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
383 __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
384 __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
385 __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
386
387 __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
388
389 // set the centroid position based on results from above
390 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
391 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
392
393 // Case (3a) No samples covered and partial sample mask
394 __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
395 // sample mask should never be all 0's for this case, but handle it anyways
396 unsigned long firstCoveredSampleMaskSample = 0;
397 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
398
399 __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
400
401 vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
402 vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
403
404 // blend in case 3a pixel locations
405 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
406 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
407 }
408
409 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
410 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
411 {
412 // evaluate I,J
413 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
414 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
415 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
416 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
417
418 // interpolate 1/w
419 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
420 }
421
422 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
423 {
424 const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
425 const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
426
427 return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
428 }
429
430 template<typename T>
431 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
432 {
433 // RT has to be single sample if we're in forcedMSAA mode
434 if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
435 {
436 return 1;
437 }
438 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
439 else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
440 {
441 return GetNumSamples(blendSampleCount);
442 }
443 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
444 else
445 {
446 return T::MultisampleT::numSamples;
447 }
448 }
449
450 inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
451 {
452 // broadcast scalars
453
454 coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
455 coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
456 coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
457
458 coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
459 coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
460 coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
461
462 coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
463 coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
464 coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
465
466 coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
467
468 coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
469 coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
470 coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
471 }
472
473 inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
474 {
475 assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
476
477 if (pColorBuffer)
478 {
479 for (uint32_t index = 0; index < colorBufferCount; index += 1)
480 {
481 pColorBuffer[index] = renderBuffers.pColor[index];
482 }
483 }
484
485 if (pDepthBuffer)
486 {
487 *pDepthBuffer = renderBuffers.pDepth;
488 }
489
490 if (pStencilBuffer)
491 {
492 *pStencilBuffer = renderBuffers.pStencil;;
493 }
494 }
495
496 template<typename T>
497 void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_TRIANGLE_DESC &work)
498 {
499 psContext->pAttribs = work.pAttribs;
500 psContext->pPerspAttribs = work.pPerspAttribs;
501 psContext->frontFace = work.triFlags.frontFacing;
502 psContext->primID = work.triFlags.primID;
503
504 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
505 psContext->I = work.I;
506 psContext->J = work.J;
507
508 psContext->recipDet = work.recipDet;
509 psContext->pRecipW = work.pRecipW;
510 psContext->pSamplePosX = reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
511 psContext->pSamplePosY = reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
512 psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
513 psContext->sampleIndex = 0;
514 }
515
516 template<typename T, bool IsSingleSample>
517 void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
518 {
519 if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
520 {
521 // for 1x case, centroid is pixel center
522 psContext->vX.centroid = psContext->vX.center;
523 psContext->vY.centroid = psContext->vY.center;
524 psContext->vI.centroid = psContext->vI.center;
525 psContext->vJ.centroid = psContext->vJ.center;
526 psContext->vOneOverW.centroid = psContext->vOneOverW.center;
527 }
528 else
529 {
530 if (T::bCentroidPos)
531 {
532 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
533 if (T::bIsStandardPattern)
534 {
535 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
536 CalcCentroidPos<T>(*psContext, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
537 }
538 else
539 {
540 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
541 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
542 }
543
544 CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
545 }
546 else
547 {
548 psContext->vX.centroid = psContext->vX.sample;
549 psContext->vY.centroid = psContext->vY.sample;
550 }
551 }
552 }
553
554 template<typename T>
555 struct PixelRateZTestLoop
556 {
557 PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
558 uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
559 pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
560 clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer) {};
561
562 INLINE
563 uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
564 const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
565 {
566 SWR_CONTEXT *pContext = pDC->pContext;
567
568 uint32_t statCount = 0;
569 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
570 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
571 {
572 const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
573 vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
574
575 if(!_simd_movemask_ps(vCoverageMask[sample]))
576 {
577 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
578 continue;
579 }
580
581 // offset depth/stencil buffers current sample
582 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
583 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
584
585 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
586 {
587 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
588
589 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
590
591 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
592 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
593
594 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
595 }
596
597 AR_BEGIN(BEBarycentric, pDC->drawId);
598
599 // calculate per sample positions
600 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
601 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
602
603 // calc I & J per sample
604 CalcSampleBarycentrics(coeffs, psContext);
605
606 if(psState.writesODepth)
607 {
608 {
609 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
610 vZ[sample] = psContext.vZ;
611 }
612 }
613 else
614 {
615 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
616 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
617 }
618
619 AR_END(BEBarycentric, 0);
620
621 ///@todo: perspective correct vs non-perspective correct clipping?
622 // if clip distances are enabled, we need to interpolate for each sample
623 if(clipDistanceMask)
624 {
625 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
626
627 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
628 }
629
630 // ZTest for this sample
631 ///@todo Need to uncomment out this bucket.
632 //AR_BEGIN(BEDepthBucket, pDC->drawId);
633 depthPassMask[sample] = vCoverageMask[sample];
634 stencilPassMask[sample] = vCoverageMask[sample];
635 depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
636 vZ[sample], pDepthSample, vCoverageMask[sample],
637 pStencilSample, &stencilPassMask[sample]);
638 //AR_END(BEDepthBucket, 0);
639
640 // early-exit if no pixels passed depth or earlyZ is forced on
641 if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
642 {
643 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
644 pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
645
646 if(!_simd_movemask_ps(depthPassMask[sample]))
647 {
648 continue;
649 }
650 }
651 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
652 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
653 statCount += _mm_popcnt_u32(statMask);
654 }
655
656 activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
657 // return number of samples that passed depth and coverage
658 return statCount;
659 }
660
661 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
662 simdscalar vZ[T::MultisampleT::numCoverageSamples];
663 simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
664 simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
665 simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
666
667 private:
668 // functor inputs
669 DRAW_CONTEXT* pDC;
670 uint32_t workerId;
671
672 const SWR_TRIANGLE_DESC& work;
673 const BarycentricCoeffs& coeffs;
674 const API_STATE& state;
675 const SWR_PS_STATE& psState;
676 const uint8_t clipDistanceMask;
677 uint8_t*& pDepthBuffer;
678 uint8_t*& pStencilBuffer;
679 };
680
681 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
682 {
683 // evaluate I,J
684 psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
685 psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
686 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
687 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
688
689 // interpolate 1/w
690 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
691 }
692
693 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
694 {
695 // evaluate I,J
696 psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
697 psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
698 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
699 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
700
701 // interpolate 1/w
702 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
703 }
704
705 // Merge Output to 4x2 SIMD Tile Format
706 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
707 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
708 {
709 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
710 const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
711 simdvector blendOut;
712
713 for(uint32_t rt = 0; rt < NumRT; ++rt)
714 {
715 uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
716
717 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
718
719 {
720 // pfnBlendFunc may not update all channels. Initialize with PS output.
721 /// TODO: move this into the blend JIT.
722 blendOut = psContext.shaded[rt];
723
724 // Blend outputs and update coverage mask for alpha test
725 if(pfnBlendFunc[rt] != nullptr)
726 {
727 pfnBlendFunc[rt](
728 pBlendState,
729 psContext.shaded[rt],
730 psContext.shaded[1],
731 psContext.shaded[0].w,
732 sample,
733 pColorSample,
734 blendOut,
735 &psContext.oMask,
736 (simdscalari*)&coverageMask);
737 }
738 }
739
740 // final write mask
741 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
742
743 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
744 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
745
746 const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
747
748 // store with color mask
749 if(!pRTBlend->writeDisableRed)
750 {
751 _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
752 }
753 if(!pRTBlend->writeDisableGreen)
754 {
755 _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
756 }
757 if(!pRTBlend->writeDisableBlue)
758 {
759 _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
760 }
761 if(!pRTBlend->writeDisableAlpha)
762 {
763 _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
764 }
765 }
766 }
767
768 #if USE_8x2_TILE_BACKEND
769 // Merge Output to 8x2 SIMD16 Tile Format
770 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
771 const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
772 {
773 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
774 uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
775
776 if (useAlternateOffset)
777 {
778 rasterTileColorOffset += sizeof(simdscalar);
779 }
780
781 simdvector blendSrc;
782 simdvector blendOut;
783
784 uint32_t colorBufferBit = 1;
785 for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
786 {
787 simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
788
789 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
790
791 if (colorBufferBit & colorBufferEnableMask)
792 {
793 blendSrc[0] = pColorSample[0];
794 blendSrc[1] = pColorSample[2];
795 blendSrc[2] = pColorSample[4];
796 blendSrc[3] = pColorSample[6];
797 }
798
799 {
800 // pfnBlendFunc may not update all channels. Initialize with PS output.
801 /// TODO: move this into the blend JIT.
802 blendOut = psContext.shaded[rt];
803
804 // Blend outputs and update coverage mask for alpha test
805 if(pfnBlendFunc[rt] != nullptr)
806 {
807 pfnBlendFunc[rt](
808 pBlendState,
809 psContext.shaded[rt],
810 psContext.shaded[1],
811 psContext.shaded[0].w,
812 sample,
813 reinterpret_cast<uint8_t *>(&blendSrc),
814 blendOut,
815 &psContext.oMask,
816 reinterpret_cast<simdscalari *>(&coverageMask));
817 }
818 }
819
820 // final write mask
821 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
822
823 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
824 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
825
826 // store with color mask
827 if (!pRTBlend->writeDisableRed)
828 {
829 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
830 }
831 if (!pRTBlend->writeDisableGreen)
832 {
833 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
834 }
835 if (!pRTBlend->writeDisableBlue)
836 {
837 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
838 }
839 if (!pRTBlend->writeDisableAlpha)
840 {
841 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
842 }
843 }
844 }
845
846 #endif
847
848 template<typename T>
849 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
850 {
851 ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
852
853
854 SWR_CONTEXT *pContext = pDC->pContext;
855
856 AR_BEGIN(BEPixelRateBackend, pDC->drawId);
857 AR_BEGIN(BESetup, pDC->drawId);
858
859 const API_STATE &state = GetApiState(pDC);
860
861 BarycentricCoeffs coeffs;
862 SetupBarycentricCoeffs(&coeffs, work);
863
864 SWR_PS_CONTEXT psContext;
865 SetupPixelShaderContext<T>(&psContext, work);
866
867 uint8_t *pDepthBuffer, *pStencilBuffer;
868 SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
869
870 AR_END(BESetup, 0);
871
872 PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
873
874 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
875 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
876
877 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
878
879 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
880 {
881 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
882 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
883
884 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
885
886 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
887 {
888 #if USE_8x2_TILE_BACKEND
889 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
890
891 #endif
892 simdscalar activeLanes;
893 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
894 activeLanes = vMask(work.anyCoveredSamples & MASK);
895
896 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
897 {
898 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
899
900 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
901 }
902
903 AR_BEGIN(BEBarycentric, pDC->drawId);
904
905 CalcPixelBarycentrics(coeffs, psContext);
906
907 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
908
909 AR_END(BEBarycentric, 0);
910
911 if(T::bForcedSampleCount)
912 {
913 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
914 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
915 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
916 }
917
918 // Early-Z?
919 if(T::bCanEarlyZ && !T::bForcedSampleCount)
920 {
921 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
922 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
923 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
924 }
925
926 // if we have no covered samples that passed depth at this point, go to next tile
927 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
928
929 if(state.psState.usesSourceDepth)
930 {
931 AR_BEGIN(BEBarycentric, pDC->drawId);
932 // interpolate and quantize z
933 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
934 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
935 AR_END(BEBarycentric, 0);
936 }
937
938 // pixels that are currently active
939 psContext.activeMask = _simd_castps_si(activeLanes);
940 psContext.oMask = T::MultisampleT::FullSampleMask();
941
942 // execute pixel shader
943 AR_BEGIN(BEPixelShader, pDC->drawId);
944 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
945 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
946 AR_END(BEPixelShader, 0);
947
948 // update active lanes to remove any discarded or oMask'd pixels
949 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
950 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
951
952 // late-Z
953 if(!T::bCanEarlyZ && !T::bForcedSampleCount)
954 {
955 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
956 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
957 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
958 }
959
960 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
961 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
962
963 // output merger
964 // loop over all samples, broadcasting the results of the PS to all passing pixels
965 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
966 {
967 AR_BEGIN(BEOutputMerger, pDC->drawId);
968 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
969 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
970 simdscalar coverageMask, depthMask;
971 if(T::bForcedSampleCount)
972 {
973 coverageMask = depthMask = activeLanes;
974 }
975 else
976 {
977 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
978 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
979 if(!_simd_movemask_ps(depthMask))
980 {
981 // stencil should already have been written in early/lateZ tests
982 AR_END(BEOutputMerger, 0);
983 continue;
984 }
985 }
986
987 // broadcast the results of the PS to all passing pixels
988 #if USE_8x2_TILE_BACKEND
989 OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
990 #else // USE_8x2_TILE_BACKEND
991 OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
992 #endif // USE_8x2_TILE_BACKEND
993
994 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
995 {
996 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
997 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
998
999 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
1000 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
1001 }
1002 AR_END(BEOutputMerger, 0);
1003 }
1004 Endtile:
1005 AR_BEGIN(BEEndTile, pDC->drawId);
1006
1007 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
1008 {
1009 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1010 }
1011
1012 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
1013 {
1014 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1015 }
1016 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
1017
1018 #if USE_8x2_TILE_BACKEND
1019 if (useAlternateOffset)
1020 {
1021 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1022 {
1023 psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1024 }
1025 }
1026 #else
1027 for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
1028 {
1029 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
1030 }
1031 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
1032 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
1033 #endif
1034
1035 AR_END(BEEndTile, 0);
1036
1037 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
1038 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
1039 }
1040
1041 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
1042 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
1043 }
1044
1045 AR_END(BEPixelRateBackend, 0);
1046 }
1047
1048 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
1049 uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
1050 >
1051 struct SwrBackendTraits
1052 {
1053 static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
1054 static const uint32_t InputCoverage = coverage;
1055 static const bool bCentroidPos = (centroid == 1);
1056 static const bool bForcedSampleCount = (forced == 1);
1057 static const bool bCanEarlyZ = (canEarlyZ == 1);
1058 typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT;
1059 };