Remove wrongly repeated words in comments
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.h
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29 #pragma once
30
31 #include "common/os.h"
32 #include "core/context.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
35
36 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
37 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
38 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
39 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
40 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
41 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
42 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
43 void InitClearTilesTable();
44 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
45 void InitBackendFuncTables();
46 void InitCPSFuncTables();
47 void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
48
// Identifiers for the backend function variants.  The enumerators are
// consecutive starting at zero, so SWR_BACKEND_FUNCS_MAX equals the number of
// variants and must remain the last entry.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE,
    SWR_BACKEND_MSAA_SAMPLE_RATE,
    SWR_BACKEND_FUNCS_MAX,          // count of variants, not a variant itself
};
56
// Declarations that only exist for the 8-wide SIMD configuration.
#if (KNOB_SIMD_WIDTH == 8)
// Per-lane X/Y offset vectors; defined in another translation unit.
// NOTE(review): names suggest pixel-center and upper-left offsets for the
// pixels of a SIMD span - confirm against the definitions.
extern const __m256 vCenterOffsetsX;
extern const __m256 vCenterOffsetsY;
extern const __m256 vULOffsetsX;
extern const __m256 vULOffsetsY;
// Eight set bits, matching the eight SIMD lanes; used below to isolate the
// coverage bits of one SIMD's worth of pixels.
#define MASK 0xff
#endif // KNOB_SIMD_WIDTH == 8
64
65 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
66 {
67 static const uint32_t RasterTileColorOffsets[16]
68 { 0,
69 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
70 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
71 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
72 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
73 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
74 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
75 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
76 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
77 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
78 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
79 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
80 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
81 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
82 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
83 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
84 };
85 assert(sampleNum < 16);
86 return RasterTileColorOffsets[sampleNum];
87 }
88
89 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
90 {
91 static const uint32_t RasterTileDepthOffsets[16]
92 { 0,
93 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
94 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
95 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
96 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
97 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
98 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
99 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
100 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
101 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
102 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
103 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
104 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
105 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
106 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
107 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
108 };
109 assert(sampleNum < 16);
110 return RasterTileDepthOffsets[sampleNum];
111 }
112
113 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
114 {
115 static const uint32_t RasterTileStencilOffsets[16]
116 { 0,
117 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
118 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
119 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
120 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
121 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
122 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
123 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
124 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
125 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
126 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
127 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
128 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
129 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
130 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
131 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
132 };
133 assert(sampleNum < 16);
134 return RasterTileStencilOffsets[sampleNum];
135 }
136
137 template<typename T>
138 INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
139 {
140
141 // will need to update for avx512
142 assert(KNOB_SIMD_WIDTH == 8);
143
144 __m256i mask[2];
145 __m256i sampleCoverage[2];
146 if(T::bIsStandardPattern)
147 {
148 __m256i src = _mm256_set1_epi32(0);
149 __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
150
151 if(T::MultisampleT::numSamples == 1)
152 {
153 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
154 }
155 else if(T::MultisampleT::numSamples == 2)
156 {
157 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
158 }
159 else if(T::MultisampleT::numSamples == 4)
160 {
161 mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
162 }
163 else if(T::MultisampleT::numSamples == 8)
164 {
165 mask[0] = _mm256_set1_epi32(-1);
166 }
167 else if(T::MultisampleT::numSamples == 16)
168 {
169 mask[0] = _mm256_set1_epi32(-1);
170 mask[1] = _mm256_set1_epi32(-1);
171 index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
172 }
173
174 // gather coverage for samples 0-7
175 sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
176 if(T::MultisampleT::numSamples > 8)
177 {
178 // gather coverage for samples 8-15
179 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
180 }
181 }
182 else
183 {
184 // center coverage is the same for all samples; just broadcast to the sample slots
185 uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
186 if(T::MultisampleT::numSamples == 1)
187 {
188 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
189 }
190 else if(T::MultisampleT::numSamples == 2)
191 {
192 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
193 }
194 else if(T::MultisampleT::numSamples == 4)
195 {
196 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
197 }
198 else if(T::MultisampleT::numSamples == 8)
199 {
200 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
201 }
202 else if(T::MultisampleT::numSamples == 16)
203 {
204 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
205 sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
206 }
207 }
208
209 mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
210 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
211 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
212 __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
213
214 __m256i packedCoverage1;
215 if(T::MultisampleT::numSamples > 8)
216 {
217 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
218 packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
219 }
220
221 #if (KNOB_ARCH == KNOB_ARCH_AVX)
222 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
223 __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
224 __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
225 packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
226
227 __m256i packedSampleCoverage;
228 if(T::MultisampleT::numSamples > 8)
229 {
230 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
231 hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
232 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
233 shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
234 packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
235 packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
236 }
237 else
238 {
239 packedSampleCoverage = packedCoverage0;
240 }
241 #else
242 __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
243 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
244 packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
245
246 __m256i packedSampleCoverage;
247 if(T::MultisampleT::numSamples > 8)
248 {
249 permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
250 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
251 packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
252
253 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
254 packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
255 }
256 else
257 {
258 packedSampleCoverage = packedCoverage0;
259 }
260 #endif
261
262 for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
263 {
264 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
265 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
266
267 if(!T::bForcedSampleCount)
268 {
269 // input coverage has to be anded with sample mask if MSAA isn't forced on
270 inputMask[i] &= sampleMask;
271 }
272
273 // shift to the next pixel in the 4x2
274 packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
275 }
276 }
277
278 template<typename T>
279 INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
280 {
281 uint32_t inputMask[KNOB_SIMD_WIDTH];
282 generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
283 inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
284 }
285
286 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
287 // Centroid behaves exactly as follows :
288 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
289 // have a sample location there).
290 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
291 // coverage with the SampleMask Rasterizer State.
292 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
293 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
// SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
295 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
296 template<typename T>
297 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
298 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
299 {
300 uint32_t inputMask[KNOB_SIMD_WIDTH];
301 generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
302
303 // Case (2) - partially covered pixel
304
305 // scan for first covered sample per pixel in the 4x2 span
306 unsigned long sampleNum[KNOB_SIMD_WIDTH];
307 (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
308 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
309 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
310 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
311 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
312 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
313 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
314 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
315
316 // look up and set the sample offsets from UL pixel corner for first covered sample
317 __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
318 T::MultisampleT::X(sampleNum[6]),
319 T::MultisampleT::X(sampleNum[5]),
320 T::MultisampleT::X(sampleNum[4]),
321 T::MultisampleT::X(sampleNum[3]),
322 T::MultisampleT::X(sampleNum[2]),
323 T::MultisampleT::X(sampleNum[1]),
324 T::MultisampleT::X(sampleNum[0]));
325
326 __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
327 T::MultisampleT::Y(sampleNum[6]),
328 T::MultisampleT::Y(sampleNum[5]),
329 T::MultisampleT::Y(sampleNum[4]),
330 T::MultisampleT::Y(sampleNum[3]),
331 T::MultisampleT::Y(sampleNum[2]),
332 T::MultisampleT::Y(sampleNum[1]),
333 T::MultisampleT::Y(sampleNum[0]));
334 // add sample offset to UL pixel corner
335 vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
336 vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
337
338 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
339 static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
340 __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
341 __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
342
343 static const __m256i vZero = _simd_setzero_si();
344 const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
345 __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
346 __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
347 __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
348
349 __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
350
351 // set the centroid position based on results from above
352 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
353 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
354
355 // Case (3a) No samples covered and partial sample mask
356 __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
357 // sample mask should never be all 0's for this case, but handle it anyways
358 unsigned long firstCoveredSampleMaskSample = 0;
359 (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
360
361 __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
362
363 vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
364 vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
365
366 // blend in case 3a pixel locations
367 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
368 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
369 }
370
371 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
372 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
373 {
374 // evaluate I,J
375 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
376 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
377 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
378 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
379
380 // interpolate 1/w
381 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
382 }
383
384 template<typename T>
385 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
386 {
387 // RT has to be single sample if we're in forcedMSAA mode
388 if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
389 {
390 return 1;
391 }
392 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
393 else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
394 {
395 return GetNumSamples(blendSampleCount);
396 }
397 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
398 else
399 {
400 return T::MultisampleT::numSamples;
401 }
402 }
403
404 template<typename T>
405 struct PixelRateZTestLoop
406 {
407 PixelRateZTestLoop(DRAW_CONTEXT *DC, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
408 uint8_t*& depthBase, uint8_t*& stencilBase, const uint8_t ClipDistanceMask) :
409 work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
410 clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {};
411
412 INLINE
413 uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
414 const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
415 {
416 uint32_t statCount = 0;
417 simdscalar anyDepthSamplePassed = _simd_setzero_ps();
418 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
419 {
420 const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
421 vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
422
423 if(!_simd_movemask_ps(vCoverageMask[sample]))
424 {
425 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
426 continue;
427 }
428
429 RDTSC_START(BEBarycentric);
430 // calculate per sample positions
431 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
432 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
433
434 // calc I & J per sample
435 CalcSampleBarycentrics(coeffs, psContext);
436
437 if(psState.writesODepth)
438 {
439 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
440 vZ[sample] = psContext.vZ;
441 }
442 else
443 {
444 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
445 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
446 }
447 RDTSC_STOP(BEBarycentric, 0, 0);
448
449 ///@todo: perspective correct vs non-perspective correct clipping?
450 // if clip distances are enabled, we need to interpolate for each sample
451 if(clipDistanceMask)
452 {
453 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer,
454 psContext.vI.sample, psContext.vJ.sample);
455 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
456 }
457
458 // offset depth/stencil buffers current sample
459 uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
460 uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
461
462 // ZTest for this sample
463 RDTSC_START(BEDepthBucket);
464 depthPassMask[sample] = vCoverageMask[sample];
465 stencilPassMask[sample] = vCoverageMask[sample];
466 depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample,
467 vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
468 RDTSC_STOP(BEDepthBucket, 0, 0);
469
470 // early-exit if no pixels passed depth or earlyZ is forced on
471 if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
472 {
473 DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
474 pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
475
476 if(!_simd_movemask_ps(depthPassMask[sample]))
477 {
478 continue;
479 }
480 }
481 anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
482 uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
483 statCount += _mm_popcnt_u32(statMask);
484 }
485
486 activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
487 // return number of samples that passed depth and coverage
488 return statCount;
489 }
490
491 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
492 simdscalar vZ[T::MultisampleT::numCoverageSamples];
493 simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
494 simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
495 simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
496
497 private:
498 // functor inputs
499 const SWR_TRIANGLE_DESC& work;
500 const BarycentricCoeffs& coeffs;
501 const API_STATE& state;
502 const SWR_PS_STATE& psState;
503 const uint8_t clipDistanceMask;
504 uint8_t*& pDepthBase;
505 uint8_t*& pStencilBase;
506 };
507
508 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
509 {
510 // evaluate I,J
511 psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
512 psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
513 psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
514 psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
515
516 // interpolate 1/w
517 psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
518 }
519
520 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
521 {
522 // evaluate I,J
523 psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
524 psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
525 psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
526 psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
527
528 // interpolate 1/w
529 psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
530 }
531
532 INLINE void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
533 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
534 {
535 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
536 const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
537 simdvector blendOut;
538
539 for(uint32_t rt = 0; rt < NumRT; ++rt)
540 {
541 uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
542
543 const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
544 // pfnBlendFunc may not update all channels. Initialize with PS output.
545 /// TODO: move this into the blend JIT.
546 blendOut = psContext.shaded[rt];
547
548 // Blend outputs and update coverage mask for alpha test
549 if(pfnBlendFunc[rt] != nullptr)
550 {
551 pfnBlendFunc[rt](
552 pBlendState,
553 psContext.shaded[rt],
554 psContext.shaded[1],
555 sample,
556 pColorSample,
557 blendOut,
558 &psContext.oMask,
559 (simdscalari*)&coverageMask);
560 }
561
562 // final write mask
563 simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
564
565 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
566 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
567
568 const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
569
570 // store with color mask
571 if(!pRTBlend->writeDisableRed)
572 {
573 _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
574 }
575 if(!pRTBlend->writeDisableGreen)
576 {
577 _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
578 }
579 if(!pRTBlend->writeDisableBlue)
580 {
581 _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
582 }
583 if(!pRTBlend->writeDisableAlpha)
584 {
585 _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
586 }
587 }
588 }
589
590 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
591 uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
592 struct SwrBackendTraits
593 {
594 static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
595 static const bool bInputCoverage = (coverage == 1);
596 static const bool bCentroidPos = (centroid == 1);
597 static const bool bForcedSampleCount = (forced == 1);
598 static const bool bCanEarlyZ = (canEarlyZ == 1);
599 typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT;
600 };