1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
// Forward declarations. Each Init* function receives a reference to a
// multi-dimensional table of PFN_BACKEND_FUNC entries; their definitions are
// not part of this listing.
31 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2]);
32 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2]);
// Declared ahead of its definition (further down in this file) because
// PixelRateZTestLoop calls it before the definition appears.
34 static INLINE
void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
);
// Identifiers for the backend function flavours. No enumerator has an explicit
// value, so SWR_BACKEND_FUNCS_MAX (listed last) is the count of flavours.
37 enum SWR_BACKEND_FUNCS
39 SWR_BACKEND_SINGLE_SAMPLE
,
40 SWR_BACKEND_MSAA_PIXEL_RATE
,
41 SWR_BACKEND_MSAA_SAMPLE_RATE
,
42 SWR_BACKEND_FUNCS_MAX
,
// Per-lane pixel offsets for the 8-wide SIMD case. Lanes 0-3 cover x in [0,2)
// and lanes 4-7 cover x in [2,4), each over two rows, i.e. a 4x2 pixel block.
// The "Center" tables are the "UL" (upper-left) tables plus 0.5 in both axes.
// NOTE(review): the matching #endif is not visible in this listing.
45 #if KNOB_SIMD_WIDTH == 8
46 static const __m256 vCenterOffsetsX
= __m256
{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
47 static const __m256 vCenterOffsetsY
= __m256
{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
48 static const __m256 vULOffsetsX
= __m256
{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
49 static const __m256 vULOffsetsY
= __m256
{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
53 static INLINE simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
55 simdscalar vClipMask
= _simd_setzero_ps();
56 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
58 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
60 // pull triangle clip distance values from clip buffer
61 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
62 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
63 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
66 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
68 // clip if interpolated clip distance is < 0 || NAN
69 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
71 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
74 return _simd_movemask_ps(vClipMask
);
77 INLINE
static uint32_t RasterTileColorOffset(uint32_t sampleNum
)
79 static const uint32_t RasterTileColorOffsets
[16]
81 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8),
82 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
83 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
84 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
85 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
86 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
87 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
88 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
89 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
90 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
91 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
92 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
93 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
94 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
95 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
97 assert(sampleNum
< 16);
98 return RasterTileColorOffsets
[sampleNum
];
101 INLINE
static uint32_t RasterTileDepthOffset(uint32_t sampleNum
)
103 static const uint32_t RasterTileDepthOffsets
[16]
105 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8),
106 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
107 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
108 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
109 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
110 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
111 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
112 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
113 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
114 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
115 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
116 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
117 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
118 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
119 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
121 assert(sampleNum
< 16);
122 return RasterTileDepthOffsets
[sampleNum
];
125 INLINE
static uint32_t RasterTileStencilOffset(uint32_t sampleNum
)
127 static const uint32_t RasterTileStencilOffsets
[16]
129 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8),
130 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
131 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
132 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
133 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
134 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
135 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
136 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
137 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
138 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
139 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
140 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
141 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
142 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
143 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
145 assert(sampleNum
< 16);
146 return RasterTileStencilOffsets
[sampleNum
];
// Helper whose constructors convert the rasterizer's per-sample coverage masks
// into per-pixel input coverage (one mask of covered samples per SIMD lane).
// NOTE(review): this listing elides some lines (braces, branch keywords, the
// declaration of `mask`); confirm structure against the full file.
149 template<typename T
, uint32_t InputCoverage
>
150 struct generateInputCoverage
// Array form: fills inputMask[] with one sample-coverage mask per pixel.
152 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
154 // will need to update for avx512
155 assert(KNOB_SIMD_WIDTH
== 8);
158 simdscalari sampleCoverage
[2];
160 if(T::bIsCenterPattern
)
162 // center coverage is the same for all samples; just broadcast to the sample slots
163 uint32_t centerCoverage
= ((uint32_t)(*coverageMask
) & MASK
);
164 if(T::MultisampleT::numSamples
== 1)
166 sampleCoverage
[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage
);
168 else if(T::MultisampleT::numSamples
== 2)
170 sampleCoverage
[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage
, centerCoverage
);
172 else if(T::MultisampleT::numSamples
== 4)
174 sampleCoverage
[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage
, centerCoverage
, centerCoverage
, centerCoverage
);
176 else if(T::MultisampleT::numSamples
== 8)
178 sampleCoverage
[0] = _simd_set1_epi32(centerCoverage
);
180 else if(T::MultisampleT::numSamples
== 16)
182 sampleCoverage
[0] = _simd_set1_epi32(centerCoverage
);
183 sampleCoverage
[1] = _simd_set1_epi32(centerCoverage
);
// NOTE(review): the statements from here to the gathers look like the
// alternative to the center-pattern path above (branch keyword elided in this
// listing) — confirm. They build a lane-enable mask for numSamples lanes and
// gather one 32-bit word per sample from coverageMask with a scale of 8 bytes.
188 simdscalari src
= _simd_set1_epi32(0);
189 simdscalari index0
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1
;
191 if(T::MultisampleT::numSamples
== 1)
193 mask
[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
195 else if(T::MultisampleT::numSamples
== 2)
197 mask
[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
199 else if(T::MultisampleT::numSamples
== 4)
201 mask
[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
203 else if(T::MultisampleT::numSamples
== 8)
205 mask
[0] = _simd_set1_epi32(-1);
207 else if(T::MultisampleT::numSamples
== 16)
209 mask
[0] = _simd_set1_epi32(-1);
210 mask
[1] = _simd_set1_epi32(-1);
211 index1
= _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
214 // gather coverage for samples 0-7
215 sampleCoverage
[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index0
, _mm256_castsi256_ps(mask
[0]), 8));
216 if(T::MultisampleT::numSamples
> 8)
218 // gather coverage for samples 8-15
219 sampleCoverage
[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index1
, _mm256_castsi256_ps(mask
[1]), 8));
// mask[0] is reused here as a byte-shuffle control: it selects byte 0 of each
// 32-bit element (indices 0x0, 0x4, 0x8, 0xC) and zeroes the rest (-1).
223 mask
[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
224 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
225 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
226 simdscalari packedCoverage0
= _simd_shuffle_epi8(sampleCoverage
[0], mask
[0]);
228 simdscalari packedCoverage1
;
229 if(T::MultisampleT::numSamples
> 8)
231 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
232 packedCoverage1
= _simd_shuffle_epi8(sampleCoverage
[1], mask
[0]);
// AVX-only path: cross-lane packing done with float permute/shuffle/blend.
235 #if (KNOB_ARCH == KNOB_ARCH_AVX)
236 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
237 simdscalari hiToLow
= _mm256_permute2f128_si256(packedCoverage0
, packedCoverage0
, 0x83);
238 simdscalar shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
239 packedCoverage0
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), shufRes
, 0xFE));
241 simdscalari packedSampleCoverage
;
242 if(T::MultisampleT::numSamples
> 8)
244 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
245 hiToLow
= _mm256_permute2f128_si256(packedCoverage1
, packedCoverage1
, 0x83);
246 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
247 shufRes
= _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1
), shufRes
, 0xFE);
248 packedCoverage1
= _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes
), _mm256_castps_pd(shufRes
), 0x01)));
249 packedSampleCoverage
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), _mm256_castsi256_ps(packedCoverage1
), 0xFC));
253 packedSampleCoverage
= packedCoverage0
;
// NOTE(review): no #else/#endif is visible in this listing. The statements
// below use _mm256_permutevar8x32_epi32 and _mm256_blend_epi32, so they are
// presumably the non-AVX (AVX2 and later) alternative to the path above.
256 simdscalari permMask
= _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
257 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
258 packedCoverage0
= _mm256_permutevar8x32_epi32(packedCoverage0
, permMask
);
260 simdscalari packedSampleCoverage
;
261 if(T::MultisampleT::numSamples
> 8)
263 permMask
= _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
264 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
265 packedCoverage1
= _mm256_permutevar8x32_epi32(packedCoverage1
, permMask
);
267 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
268 packedSampleCoverage
= _mm256_blend_epi32(packedCoverage0
, packedCoverage1
, 0x0C);
272 packedSampleCoverage
= packedCoverage0
;
// Walk the pixels from lane KNOB_SIMD_WIDTH-1 down to 0: movemask_epi8 takes
// the top bit of every byte, then the vector is shifted left by one so the
// next lower bit is on top for the next pixel.
276 for(int32_t i
= KNOB_SIMD_WIDTH
- 1; i
>= 0; i
--)
278 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
279 inputMask
[i
] = _simd_movemask_epi8(packedSampleCoverage
);
281 if(!T::bForcedSampleCount
)
283 // input coverage has to be anded with sample mask if MSAA isn't forced on
284 inputMask
[i
] &= sampleMask
;
287 // shift to the next pixel in the 4x2
288 packedSampleCoverage
= _simd_slli_epi32(packedSampleCoverage
, 1);
// SIMD form: runs the array form above and packs the eight per-pixel masks
// into the lanes of inputCoverage (lane i holds inputMask[i]).
292 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, simdscalar
&inputCoverage
, const uint32_t sampleMask
)
294 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
295 generateInputCoverage
<T
, T::InputCoverage
>(coverageMask
, inputMask
, sampleMask
);
296 inputCoverage
= _simd_castsi_ps(_simd_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]));
302 struct generateInputCoverage
<T
, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>
304 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, simdscalar
&inputCoverage
, const uint32_t sampleMask
)
306 // will need to update for avx512
307 assert(KNOB_SIMD_WIDTH
== 8);
308 simdscalari vec
= _simd_set1_epi32(coverageMask
[0]);
309 const simdscalari bit
= _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
310 vec
= _simd_and_si(vec
, bit
);
311 vec
= _simd_cmplt_epi32(_simd_setzero_si(), vec
);
312 vec
= _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec
);
313 inputCoverage
= _simd_castsi_ps(vec
);
316 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
318 uint32_t simdCoverage
= (coverageMask
[0] & MASK
);
319 static const uint32_t FullCoverageMask
= (1 << T::MultisampleT::numSamples
) - 1;
320 for(int i
= 0; i
< KNOB_SIMD_WIDTH
; i
++)
322 // set all samples to covered if conservative coverage mask is set for that pixel
323 inputMask
[i
] = (((1 << i
) & simdCoverage
) > 0) ? FullCoverageMask
: 0;
328 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
329 // Centroid behaves exactly as follows :
330 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
331 // have a sample location there).
332 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
333 // coverage with the SampleMask Rasterizer State.
334 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
335 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
336 // SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
337 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
339 INLINE
void CalcCentroidPos(SWR_PS_CONTEXT
&psContext
, const SWR_MULTISAMPLE_POS
& samplePos
,
340 const uint64_t *const coverageMask
, const uint32_t sampleMask
,
341 const simdscalar vXSamplePosUL
, const simdscalar vYSamplePosUL
)
343 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
344 generateInputCoverage
<T
, T::InputCoverage
>(coverageMask
, inputMask
, sampleMask
);
346 // Case (2) - partially covered pixel
348 // scan for first covered sample per pixel in the 4x2 span
349 unsigned long sampleNum
[KNOB_SIMD_WIDTH
];
350 (inputMask
[0] > 0) ? (_BitScanForward(&sampleNum
[0], inputMask
[0])) : (sampleNum
[0] = 0);
351 (inputMask
[1] > 0) ? (_BitScanForward(&sampleNum
[1], inputMask
[1])) : (sampleNum
[1] = 0);
352 (inputMask
[2] > 0) ? (_BitScanForward(&sampleNum
[2], inputMask
[2])) : (sampleNum
[2] = 0);
353 (inputMask
[3] > 0) ? (_BitScanForward(&sampleNum
[3], inputMask
[3])) : (sampleNum
[3] = 0);
354 (inputMask
[4] > 0) ? (_BitScanForward(&sampleNum
[4], inputMask
[4])) : (sampleNum
[4] = 0);
355 (inputMask
[5] > 0) ? (_BitScanForward(&sampleNum
[5], inputMask
[5])) : (sampleNum
[5] = 0);
356 (inputMask
[6] > 0) ? (_BitScanForward(&sampleNum
[6], inputMask
[6])) : (sampleNum
[6] = 0);
357 (inputMask
[7] > 0) ? (_BitScanForward(&sampleNum
[7], inputMask
[7])) : (sampleNum
[7] = 0);
359 // look up and set the sample offsets from UL pixel corner for first covered sample
360 simdscalar vXSample
= _simd_set_ps(samplePos
.X(sampleNum
[7]),
361 samplePos
.X(sampleNum
[6]),
362 samplePos
.X(sampleNum
[5]),
363 samplePos
.X(sampleNum
[4]),
364 samplePos
.X(sampleNum
[3]),
365 samplePos
.X(sampleNum
[2]),
366 samplePos
.X(sampleNum
[1]),
367 samplePos
.X(sampleNum
[0]));
369 simdscalar vYSample
= _simd_set_ps(samplePos
.Y(sampleNum
[7]),
370 samplePos
.Y(sampleNum
[6]),
371 samplePos
.Y(sampleNum
[5]),
372 samplePos
.Y(sampleNum
[4]),
373 samplePos
.Y(sampleNum
[3]),
374 samplePos
.Y(sampleNum
[2]),
375 samplePos
.Y(sampleNum
[1]),
376 samplePos
.Y(sampleNum
[0]));
377 // add sample offset to UL pixel corner
378 vXSample
= _simd_add_ps(vXSamplePosUL
, vXSample
);
379 vYSample
= _simd_add_ps(vYSamplePosUL
, vYSample
);
381 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
382 static const simdscalari vFullyCoveredMask
= T::MultisampleT::FullSampleMask();
383 simdscalari vInputCoveragei
= _simd_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]);
384 simdscalari vAllSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vFullyCoveredMask
);
386 static const simdscalari vZero
= _simd_setzero_si();
387 const simdscalari vSampleMask
= _simd_and_si(_simd_set1_epi32(sampleMask
), vFullyCoveredMask
);
388 simdscalari vNoSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vZero
);
389 simdscalari vIsFullSampleMask
= _simd_cmpeq_epi32(vSampleMask
, vFullyCoveredMask
);
390 simdscalari vCase3b
= _simd_and_si(vNoSamplesCovered
, vIsFullSampleMask
);
392 simdscalari vEvalAtCenter
= _simd_or_si(vAllSamplesCovered
, vCase3b
);
394 // set the centroid position based on results from above
395 psContext
.vX
.centroid
= _simd_blendv_ps(vXSample
, psContext
.vX
.center
, _simd_castsi_ps(vEvalAtCenter
));
396 psContext
.vY
.centroid
= _simd_blendv_ps(vYSample
, psContext
.vY
.center
, _simd_castsi_ps(vEvalAtCenter
));
398 // Case (3a) No samples covered and partial sample mask
399 simdscalari vSomeSampleMaskSamples
= _simd_cmplt_epi32(vSampleMask
, vFullyCoveredMask
);
400 // sample mask should never be all 0's for this case, but handle it anyways
401 unsigned long firstCoveredSampleMaskSample
= 0;
402 (sampleMask
> 0) ? (_BitScanForward(&firstCoveredSampleMaskSample
, sampleMask
)) : (firstCoveredSampleMaskSample
= 0);
404 simdscalari vCase3a
= _simd_and_si(vNoSamplesCovered
, vSomeSampleMaskSamples
);
406 vXSample
= _simd_set1_ps(samplePos
.X(firstCoveredSampleMaskSample
));
407 vYSample
= _simd_set1_ps(samplePos
.Y(firstCoveredSampleMaskSample
));
409 // blend in case 3a pixel locations
410 psContext
.vX
.centroid
= _simd_blendv_ps(psContext
.vX
.centroid
, vXSample
, _simd_castsi_ps(vCase3a
));
411 psContext
.vY
.centroid
= _simd_blendv_ps(psContext
.vY
.centroid
, vYSample
, _simd_castsi_ps(vCase3a
));
414 INLINE
void CalcCentroidBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
,
415 const simdscalar vXSamplePosUL
, const simdscalar vYSamplePosUL
)
418 psContext
.vI
.centroid
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
419 psContext
.vJ
.centroid
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
420 psContext
.vI
.centroid
= _simd_mul_ps(psContext
.vI
.centroid
, coeffs
.vRecipDet
);
421 psContext
.vJ
.centroid
= _simd_mul_ps(psContext
.vJ
.centroid
, coeffs
.vRecipDet
);
424 psContext
.vOneOverW
.centroid
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.centroid
, psContext
.vJ
.centroid
);
427 INLINE simdmask
CalcDepthBoundsAcceptMask(simdscalar z
, float minz
, float maxz
)
429 const simdscalar minzMask
= _simd_cmpge_ps(z
, _simd_set1_ps(minz
));
430 const simdscalar maxzMask
= _simd_cmple_ps(z
, _simd_set1_ps(maxz
));
432 return _simd_movemask_ps(_simd_and_ps(minzMask
, maxzMask
));
// Picks the sample count at which the output merger runs, from the
// forced-sample-count trait of T and the blend (render target) sample count.
// NOTE(review): this listing elides some lines; the body of the first branch
// (forced sample count with more than one sample) is not visible here —
// confirm against the full file.
436 INLINE
uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount
)
438 // RT has to be single sample if we're in forcedMSAA mode
439 if(T::bForcedSampleCount
&& (T::MultisampleT::sampleCount
> SWR_MULTISAMPLE_1X
))
443 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
444 else if(T::bForcedSampleCount
&& (T::MultisampleT::sampleCount
== SWR_MULTISAMPLE_1X
))
446 return GetNumSamples(blendSampleCount
);
448 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
451 return T::MultisampleT::numSamples
;
455 inline void SetupBarycentricCoeffs(BarycentricCoeffs
*coeffs
, const SWR_TRIANGLE_DESC
&work
)
459 coeffs
->vIa
= _simd_broadcast_ss(&work
.I
[0]);
460 coeffs
->vIb
= _simd_broadcast_ss(&work
.I
[1]);
461 coeffs
->vIc
= _simd_broadcast_ss(&work
.I
[2]);
463 coeffs
->vJa
= _simd_broadcast_ss(&work
.J
[0]);
464 coeffs
->vJb
= _simd_broadcast_ss(&work
.J
[1]);
465 coeffs
->vJc
= _simd_broadcast_ss(&work
.J
[2]);
467 coeffs
->vZa
= _simd_broadcast_ss(&work
.Z
[0]);
468 coeffs
->vZb
= _simd_broadcast_ss(&work
.Z
[1]);
469 coeffs
->vZc
= _simd_broadcast_ss(&work
.Z
[2]);
471 coeffs
->vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
473 coeffs
->vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
474 coeffs
->vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
475 coeffs
->vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Copies hot-tile buffer pointers out of renderBuffers: one color pointer for
// every bit set in colorHotTileMask, plus the depth and stencil pointers.
// NOTE(review): this listing elides some lines (the declaration of `index`
// and whatever guards the depth/stencil stores) — confirm against the full file.
478 inline void SetupRenderBuffers(uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], uint8_t **pDepthBuffer
, uint8_t **pStencilBuffer
, uint32_t colorHotTileMask
, RenderOutputBuffers
&renderBuffers
)
// Visit the set bits of the mask lowest-first, clearing each as it is handled.
482 while (_BitScanForward(&index
, colorHotTileMask
))
484 assert(index
< SWR_NUM_RENDERTARGETS
);
485 colorHotTileMask
&= ~(1 << index
);
486 pColorBuffer
[index
] = renderBuffers
.pColor
[index
];
491 *pDepthBuffer
= renderBuffers
.pDepth
;
// NOTE(review): the statement below ends in a stray second ';' (harmless
// empty statement).
496 *pStencilBuffer
= renderBuffers
.pStencil
;;
501 void SetupPixelShaderContext(SWR_PS_CONTEXT
*psContext
, const SWR_MULTISAMPLE_POS
& samplePos
, SWR_TRIANGLE_DESC
&work
)
503 psContext
->pAttribs
= work
.pAttribs
;
504 psContext
->pPerspAttribs
= work
.pPerspAttribs
;
505 psContext
->frontFace
= work
.triFlags
.frontFacing
;
506 psContext
->renderTargetArrayIndex
= work
.triFlags
.renderTargetArrayIndex
;
508 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
509 psContext
->I
= work
.I
;
510 psContext
->J
= work
.J
;
512 psContext
->recipDet
= work
.recipDet
;
513 psContext
->pRecipW
= work
.pRecipW
;
514 psContext
->pSamplePosX
= samplePos
.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
515 psContext
->pSamplePosY
= samplePos
.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
516 psContext
->rasterizerSampleCount
= T::MultisampleT::numSamples
;
517 psContext
->sampleIndex
= 0;
// Fills the centroid fields of the pixel shader context. For the single-sample
// case they are copied from the pixel-center fields; otherwise the centroid
// position is either UL + 0.5 (center pattern) or computed by CalcCentroidPos,
// followed by CalcCentroidBarycentrics.
// NOTE(review): this listing elides braces and branch keywords, so which
// condition selects the final "centroid = sample" assignments is not visible
// here — confirm against the full file.
520 template<typename T
, bool IsSingleSample
>
521 void CalcCentroid(SWR_PS_CONTEXT
*psContext
, const SWR_MULTISAMPLE_POS
& samplePos
,
522 const BarycentricCoeffs
&coeffs
, const uint64_t * const coverageMask
, uint32_t sampleMask
)
524 if (IsSingleSample
) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
526 // for 1x case, centroid is pixel center
527 psContext
->vX
.centroid
= psContext
->vX
.center
;
528 psContext
->vY
.centroid
= psContext
->vY
.center
;
529 psContext
->vI
.centroid
= psContext
->vI
.center
;
530 psContext
->vJ
.centroid
= psContext
->vJ
.center
;
531 psContext
->vOneOverW
.centroid
= psContext
->vOneOverW
.center
;
537 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
538 if (T::bIsCenterPattern
)
// center pattern: the centroid is the pixel center, i.e. UL corner + 0.5
540 psContext
->vX
.centroid
= _simd_add_ps(psContext
->vX
.UL
, _simd_set1_ps(0.5f
));
541 psContext
->vY
.centroid
= _simd_add_ps(psContext
->vY
.UL
, _simd_set1_ps(0.5f
));
545 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
546 CalcCentroidPos
<T
>(*psContext
, samplePos
, coverageMask
, sampleMask
, psContext
->vX
.UL
, psContext
->vY
.UL
);
// evaluate I, J and 1/w at the centroid position chosen above
549 CalcCentroidBarycentrics(coeffs
, *psContext
, psContext
->vX
.UL
, psContext
->vY
.UL
);
553 psContext
->vX
.centroid
= psContext
->vX
.sample
;
554 psContext
->vY
.centroid
= psContext
->vY
.sample
;
// Functor that runs the depth/stencil test for every coverage sample of one
// SIMD of pixels and keeps the per-sample Z, coverage, depth-pass and
// stencil-pass masks in its member arrays for later use.
// NOTE(review): this listing elides some lines (braces, `else`, loop-control
// and return statements, some member declarations) — confirm against the
// full file.
560 struct PixelRateZTestLoop
// Stores the draw/triangle/state inputs; the depth and stencil buffer
// pointers are held by reference.
562 PixelRateZTestLoop(DRAW_CONTEXT
*DC
, uint32_t _workerId
, const SWR_TRIANGLE_DESC
&Work
, const BarycentricCoeffs
& Coeffs
, const API_STATE
& apiState
,
563 uint8_t*& depthBuffer
, uint8_t*& stencilBuffer
, const uint8_t ClipDistanceMask
) :
564 pDC(DC
), workerId(_workerId
), work(Work
), coeffs(Coeffs
), state(apiState
), psState(apiState
.psState
),
565 samplePos(state
.rastState
.samplePositions
),
566 clipDistanceMask(ClipDistanceMask
), pDepthBuffer(depthBuffer
), pStencilBuffer(stencilBuffer
){};
// Runs the per-sample loop. activeLanes is updated in place: it is ANDed with
// the union of the depth-pass masks of all samples. statCount accumulates the
// number of lanes that passed depth, summed over samples.
569 uint32_t operator()(simdscalar
& activeLanes
, SWR_PS_CONTEXT
& psContext
,
570 const CORE_BUCKETS BEDepthBucket
, uint32_t currentSimdIn8x8
= 0)
572 SWR_CONTEXT
*pContext
= pDC
->pContext
;
574 uint32_t statCount
= 0;
575 simdscalar anyDepthSamplePassed
= _simd_setzero_ps();
576 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
// one byte of the sample's coverage word per SIMD; currentSimdIn8x8 picks the byte
578 const uint8_t *pCoverageMask
= (uint8_t*)&work
.coverageMask
[sample
];
579 vCoverageMask
[sample
] = _simd_and_ps(activeLanes
, _simd_vmask_ps(pCoverageMask
[currentSimdIn8x8
] & MASK
));
// nothing covered for this sample: clear all three masks
581 if(!_simd_movemask_ps(vCoverageMask
[sample
]))
583 vCoverageMask
[sample
] = depthPassMask
[sample
] = stencilPassMask
[sample
] = _simd_setzero_ps();
587 // offset depth/stencil buffers current sample
588 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
589 uint8_t * pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// depth bounds test: drop lanes whose stored depth lies outside [min, max]
591 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
593 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
595 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
597 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
598 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
600 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z
, minz
, maxz
)));
603 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
605 // calculate per sample positions
606 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
607 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
609 // calc I & J per sample
610 CalcSampleBarycentrics(coeffs
, psContext
);
// Z source: shader-written depth when the PS writes oDepth; the interpolated
// and quantized Z below is presumably the alternative branch (keyword elided).
612 if(psState
.writesODepth
)
615 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
616 vZ
[sample
] = psContext
.vZ
;
621 vZ
[sample
] = vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
622 vZ
[sample
] = state
.pfnQuantizeDepth(vZ
[sample
]);
625 AR_END(BEBarycentric
, 0);
627 ///@todo: perspective correct vs non-perspective correct clipping?
628 // if clip distances are enabled, we need to interpolate for each sample
631 uint8_t clipMask
= ComputeUserClipMask(clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
633 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], _simd_vmask_ps(~clipMask
));
636 // ZTest for this sample
637 ///@todo Need to uncomment out this bucket.
638 //AR_BEGIN(BEDepthBucket, pDC->drawId);
639 depthPassMask
[sample
] = vCoverageMask
[sample
];
640 stencilPassMask
[sample
] = vCoverageMask
[sample
];
641 depthPassMask
[sample
] = DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
642 vZ
[sample
], pDepthSample
, vCoverageMask
[sample
],
643 pStencilSample
, &stencilPassMask
[sample
]);
644 //AR_END(BEDepthBucket, 0);
646 // early-exit if no pixels passed depth or earlyZ is forced on
647 if(psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
[sample
]))
649 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, vZ
[sample
],
650 pDepthSample
, depthPassMask
[sample
], vCoverageMask
[sample
], pStencilSample
, stencilPassMask
[sample
]);
652 if(!_simd_movemask_ps(depthPassMask
[sample
]))
// accumulate lanes that passed depth for any sample, and count them
657 anyDepthSamplePassed
= _simd_or_ps(anyDepthSamplePassed
, depthPassMask
[sample
]);
658 uint32_t statMask
= _simd_movemask_ps(depthPassMask
[sample
]);
659 statCount
+= _mm_popcnt_u32(statMask
);
662 activeLanes
= _simd_and_ps(anyDepthSamplePassed
, activeLanes
);
663 // return number of samples that passed depth and coverage
667 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
668 simdscalar vZ
[T::MultisampleT::numCoverageSamples
];
669 simdscalar vCoverageMask
[T::MultisampleT::numCoverageSamples
];
670 simdscalar depthPassMask
[T::MultisampleT::numCoverageSamples
];
671 simdscalar stencilPassMask
[T::MultisampleT::numCoverageSamples
];
// inputs captured by the constructor (reference members)
678 const SWR_TRIANGLE_DESC
& work
;
679 const BarycentricCoeffs
& coeffs
;
680 const API_STATE
& state
;
681 const SWR_PS_STATE
& psState
;
682 const SWR_MULTISAMPLE_POS
& samplePos
;
683 const uint8_t clipDistanceMask
;
684 uint8_t*& pDepthBuffer
;
685 uint8_t*& pStencilBuffer
;
688 INLINE
void CalcPixelBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
691 psContext
.vI
.center
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.center
, psContext
.vY
.center
);
692 psContext
.vJ
.center
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.center
, psContext
.vY
.center
);
693 psContext
.vI
.center
= _simd_mul_ps(psContext
.vI
.center
, coeffs
.vRecipDet
);
694 psContext
.vJ
.center
= _simd_mul_ps(psContext
.vJ
.center
, coeffs
.vRecipDet
);
697 psContext
.vOneOverW
.center
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.center
, psContext
.vJ
.center
);
700 static INLINE
void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
703 psContext
.vI
.sample
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
704 psContext
.vJ
.sample
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
705 psContext
.vI
.sample
= _simd_mul_ps(psContext
.vI
.sample
, coeffs
.vRecipDet
);
706 psContext
.vJ
.sample
= _simd_mul_ps(psContext
.vJ
.sample
, coeffs
.vRecipDet
);
709 psContext
.vOneOverW
.sample
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
712 // Merge Output to 4x2 SIMD Tile Format
//
// For every render target whose bit is set in renderTargetMask: seeds blendOut
// with the pixel shader output psContext.shaded[rt], calls pfnBlendFunc[rt]
// when it is non-null, then mask-stores the x/y/z/w channels of blendOut to
// that target's color memory, skipping channels whose writeDisable* flag is set.
//
// psContext         pixel shader context; shaded[] outputs are read
// pColorBase        per-render-target base pointers into color memory
// sample            sample index; selects the color offset via RasterTileColorOffset
// pBlendState       blend state; renderTarget[rt] supplies the write-disable flags
// pfnBlendFunc      per-render-target blend functions; nullptr entries are skipped
// coverageMask      passed by reference; its address is handed to the blend function
// depthPassMask     ANDed with coverageMask to form the final store mask
// renderTargetMask  bitmask of render targets to process (local copy is consumed)
//
// NOTE(review): this listing does not show the declarations of blendOut / rt or
// most arguments of the pfnBlendFunc[rt] call - confirm against the full file.
713 INLINE
void OutputMerger4x2(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
714 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar depthPassMask
, uint32_t renderTargetMask
)
716 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
717 const uint32_t rasterTileColorOffset
= RasterTileColorOffset(sample
);
// iterate over the set bits of renderTargetMask, lowest index first
721 while (_BitScanForward(&rt
, renderTargetMask
))
// clear the bit just found so the next scan moves on to the following target
723 renderTargetMask
&= ~(1 << rt
);
724 uint8_t *pColorSample
= pColorBase
[rt
] + rasterTileColorOffset
;
726 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
729 // pfnBlendFunc may not update all channels. Initialize with PS output.
730 /// TODO: move this into the blend JIT.
731 blendOut
= psContext
.shaded
[rt
];
733 // Blend outputs and update coverage mask for alpha test
734 if(pfnBlendFunc
[rt
] != nullptr)
738 psContext
.shaded
[rt
],
740 psContext
.shaded
[0].w
,
745 (simdscalari
*)&coverageMask
);
// only lanes set in both coverageMask and depthPassMask are written
750 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
752 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
753 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
// byte distance between consecutive channel planes (x, y, z, w) of one SIMD tile
755 const uint32_t simd
= KNOB_SIMD_WIDTH
* sizeof(float);
757 // store with color mask
758 if(!pRTBlend
->writeDisableRed
)
760 _simd_maskstore_ps((float*)pColorSample
, outputMask
, blendOut
.x
);
762 if(!pRTBlend
->writeDisableGreen
)
764 _simd_maskstore_ps((float*)(pColorSample
+ simd
), outputMask
, blendOut
.y
);
766 if(!pRTBlend
->writeDisableBlue
)
768 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 2), outputMask
, blendOut
.z
);
770 if(!pRTBlend
->writeDisableAlpha
)
772 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 3), outputMask
, blendOut
.w
);
777 #if USE_8x2_TILE_BACKEND
778 // Merge Output to 8x2 SIMD16 Tile Format
//
// Same per-render-target flow as the 4x2 merger (seed blendOut from
// psContext.shaded[rt], call pfnBlendFunc[rt] when non-null, mask-store the
// channels), but the destination is addressed as simdscalar elements with the
// x/y/z/w channels at indices 0/2/4/6, and the existing color values are first
// copied into blendSrc[0..3].
//
// psContext           pixel shader context; shaded[] outputs are read
// pColorBase          per-render-target base pointers into color memory
// sample              sample index; selects the color offset via RasterTileColorOffset
// pBlendState         blend state; renderTarget[rt] supplies the write-disable flags
// pfnBlendFunc        per-render-target blend functions; nullptr entries are skipped
// coverageMask        passed by reference; its address is handed to the blend function
// depthPassMask       ANDed with coverageMask to form the final store mask
// renderTargetMask    bitmask of render targets to process (local copy is consumed)
// useAlternateOffset  when true, the color offset is advanced by sizeof(simdscalar)
//
// NOTE(review): this listing does not show the declarations of blendOut /
// blendSrc / rt, the condition guarding the blendSrc loads, or most arguments
// of the pfnBlendFunc[rt] call - confirm against the full file.
779 INLINE
void OutputMerger8x2(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
780 const PFN_BLEND_JIT_FUNC(&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar depthPassMask
, uint32_t renderTargetMask
, bool useAlternateOffset
)
782 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
783 uint32_t rasterTileColorOffset
= RasterTileColorOffset(sample
);
785 if (useAlternateOffset
)
787 rasterTileColorOffset
+= sizeof(simdscalar
);
// iterate over the set bits of renderTargetMask, lowest index first
794 while (_BitScanForward(&rt
, renderTargetMask
))
// clear the bit just found so the next scan moves on to the following target
796 renderTargetMask
&= ~(1 << rt
);
798 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
800 simdscalar
* pColorSample
;
// true when at least one color channel of this render target is write-enabled
801 bool hotTileEnable
= !pRTBlend
->writeDisableAlpha
|| !pRTBlend
->writeDisableRed
|| !pRTBlend
->writeDisableGreen
|| !pRTBlend
->writeDisableBlue
;
// NOTE(review): presumably the loads below run only when hotTileEnable is set and
// the nullptr assignment is the other branch - the branch lines are not shown here
804 pColorSample
= reinterpret_cast<simdscalar
*>(pColorBase
[rt
] + rasterTileColorOffset
);
805 blendSrc
[0] = pColorSample
[0];
806 blendSrc
[1] = pColorSample
[2];
807 blendSrc
[2] = pColorSample
[4];
808 blendSrc
[3] = pColorSample
[6];
812 pColorSample
= nullptr;
816 // pfnBlendFunc may not update all channels. Initialize with PS output.
817 /// TODO: move this into the blend JIT.
818 blendOut
= psContext
.shaded
[rt
];
820 // Blend outputs and update coverage mask for alpha test
821 if(pfnBlendFunc
[rt
] != nullptr)
825 psContext
.shaded
[rt
],
827 psContext
.shaded
[0].w
,
829 reinterpret_cast<uint8_t *>(&blendSrc
),
832 reinterpret_cast<simdscalari
*>(&coverageMask
));
// only lanes set in both coverageMask and depthPassMask are written
837 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
839 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
840 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
842 // store with color mask
843 if (!pRTBlend
->writeDisableRed
)
845 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[0]), outputMask
, blendOut
.x
);
847 if (!pRTBlend
->writeDisableGreen
)
849 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[2]), outputMask
, blendOut
.y
);
851 if (!pRTBlend
->writeDisableBlue
)
853 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[4]), outputMask
, blendOut
.z
);
855 if (!pRTBlend
->writeDisableAlpha
)
857 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[6]), outputMask
, blendOut
.w
);
/// @brief Pixel-rate backend. Walks the region [x, x + KNOB_TILE_X_DIM) by
/// [y, y + KNOB_TILE_Y_DIM) in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps. For each
/// step it computes pixel-center barycentrics and centroids, optionally runs the
/// early depth test, invokes the pixel shader once, optionally runs the late depth
/// test, and then loops over the output-merger samples, calling the output merger
/// (and, unless early Z is forced, DepthStencilWrite) with the single shader result.
///
/// @param pDC            draw context; supplies pContext, drawId, the API state
///                       (GetApiState) and the private state (GetPrivateState)
/// @param workerId       forwarded to the PixelRateZTestLoop constructor
/// @param x              starting x coordinate of the region
/// @param y              starting y coordinate of the region
/// @param work           triangle descriptor; coverageMask[], innerCoverageMask and
///                       anyCoveredSamples are shifted in place as SIMD tiles are consumed
/// @param renderBuffers  forwarded to SetupRenderBuffers to obtain the color,
///                       depth and stencil pointers
///
/// NOTE(review): T is used as a traits type (T::bCanEarlyZ, T::InputCoverage, ...);
/// the template header, braces, some preprocessor lines and the Endtile label are
/// not visible in this listing - confirm against the full file.
865 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
867 ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
870 SWR_CONTEXT
*pContext
= pDC
->pContext
;
872 AR_BEGIN(BEPixelRateBackend
, pDC
->drawId
);
873 AR_BEGIN(BESetup
, pDC
->drawId
);
875 const API_STATE
&state
= GetApiState(pDC
);
877 BarycentricCoeffs coeffs
;
878 SetupBarycentricCoeffs(&coeffs
, work
);
880 SWR_PS_CONTEXT psContext
;
881 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
882 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
884 uint8_t *pDepthBuffer
, *pStencilBuffer
;
885 SetupRenderBuffers(psContext
.pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.colorHottileEnable
, renderBuffers
);
// depth/stencil test functor; its saved per-sample masks and Z are read back in the OM loop below
889 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, workerId
, work
, coeffs
, state
, pDepthBuffer
, pStencilBuffer
, state
.rastState
.clipDistanceMask
);
// initial Y positions (upper-left and center) for the first row of SIMD tiles
891 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
892 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
894 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
896 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X positions restart at x for every row of SIMD tiles
898 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
899 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
901 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
903 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
905 #if USE_8x2_TILE_BACKEND
906 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
908 simdscalar activeLanes
;
// no covered samples in this SIMD tile: skip straight to the end-of-tile bookkeeping
909 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
910 activeLanes
= _simd_vmask_ps(work
.anyCoveredSamples
& MASK
);
912 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
914 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
916 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
919 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
921 CalcPixelBarycentrics(coeffs
, psContext
);
923 CalcCentroid
<T
, false>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
925 AR_END(BEBarycentric
, 0);
927 if(T::bForcedSampleCount
)
929 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
930 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state
.blendState
.sampleMask
), _simd_setzero_si()));
931 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
// early depth test: only when the traits allow early Z and sample count is not forced
935 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
937 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
938 UPDATE_STAT_BE(DepthPassCount
, depthPassCount
);
939 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount
, _simd_movemask_ps(activeLanes
)));
942 // if we have no covered samples that passed depth at this point, go to next tile
943 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
945 if(state
.psState
.usesSourceDepth
)
947 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
948 // interpolate and quantize z
949 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
950 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
951 AR_END(BEBarycentric
, 0);
954 // pixels that are currently active
955 psContext
.activeMask
= _simd_castps_si(activeLanes
);
956 psContext
.oMask
= T::MultisampleT::FullSampleMask();
958 // execute pixel shader
959 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
960 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
961 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
962 AR_END(BEPixelShader
, 0);
964 // update active lanes to remove any discarded or oMask'd pixels
965 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
966 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// late depth test: the complement of the early-Z condition above
969 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
971 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
972 UPDATE_STAT_BE(DepthPassCount
, depthPassCount
);
973 AR_EVENT(LateDepthInfoPixelRate(depthPassCount
, _simd_movemask_ps(activeLanes
)));
976 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
977 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
980 // loop over all samples, broadcasting the results of the PS to all passing pixels
981 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(state
.blendState
.sampleCount
); sample
++)
983 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
984 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
985 uint32_t coverageSampleNum
= (T::bIsCenterPattern
) ? 0 : sample
;
986 simdscalar coverageMask
, depthMask
;
987 if(T::bForcedSampleCount
)
989 coverageMask
= depthMask
= activeLanes
;
// otherwise reuse the masks saved by the depth test for this coverage sample
993 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
994 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
995 if(!_simd_movemask_ps(depthMask
))
997 // stencil should already have been written in early/lateZ tests
998 AR_END(BEOutputMerger
, 0);
1003 // broadcast the results of the PS to all passing pixels
1004 #if USE_8x2_TILE_BACKEND
1005 OutputMerger8x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
,state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.renderTargetMask
, useAlternateOffset
);
1006 #else // USE_8x2_TILE_BACKEND
1007 OutputMerger4x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.renderTargetMask
);
1008 #endif // USE_8x2_TILE_BACKEND
// depth/stencil write happens here unless early Z is forced or sample count is forced
1010 if(!state
.psState
.forceEarlyZ
&& !T::bForcedSampleCount
)
1012 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
1013 uint8_t * pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
1015 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
1016 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
1018 AR_END(BEOutputMerger
, 0);
1021 AR_BEGIN(BEEndTile
, pDC
->drawId
);
// consume this SIMD tile's bits from every coverage mask so the next tile's bits move down
1023 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1025 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1028 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
1030 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1032 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// advance the color pointer of every render target enabled in colorHottileEnable
// NOTE(review): the 8x2 variant steps by two SIMD widths and only when useAlternateOffset
// is set; the preprocessor lines separating it from the non-8x2 variant are not shown here
1034 #if USE_8x2_TILE_BACKEND
1035 if (useAlternateOffset
)
1038 uint32_t rtMask
= state
.colorHottileEnable
;
1039 while (_BitScanForward(&rt
, rtMask
))
1041 rtMask
&= ~(1 << rt
);
1042 psContext
.pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1047 uint32_t rtMask
= state
.colorHottileEnable
;
1048 while (_BitScanForward(&rt
, rtMask
))
1050 rtMask
&= ~(1 << rt
);
1051 psContext
.pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// advance depth and stencil pointers by KNOB_SIMD_WIDTH pixels in their hot tile formats
1054 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1055 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1057 AR_END(BEEndTile
, 0);
// step the X positions to the next SIMD tile in this row
1059 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
1060 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// step the Y positions to the next row of SIMD tiles
1063 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
1064 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
1067 AR_END(BEPixelRateBackend
, 0);
/// Compile-time description of a backend configuration. Each non-type template
/// argument is exposed as a static member, so code templated on this type can
/// branch on T::bIsCenterPattern, T::InputCoverage, T::bCentroidPos,
/// T::bForcedSampleCount and T::bCanEarlyZ.
///
/// sampleCountT  sample count, cast to SWR_MULTISAMPLE_COUNT for MultisampleT
///               (defaults to SWR_MULTISAMPLE_1X)
/// isCenter      1 sets bIsCenterPattern
/// coverage      stored unchanged as InputCoverage
/// centroid      1 sets bCentroidPos
/// forced        1 sets bForcedSampleCount
/// canEarlyZ     1 sets bCanEarlyZ
///
/// The boolean members are true only when the argument equals exactly 1; every
/// argument defaults to 0.
1070 template<uint32_t sampleCountT
= SWR_MULTISAMPLE_1X
, uint32_t isCenter
= 0,
1071 uint32_t coverage
= 0, uint32_t centroid
= 0, uint32_t forced
= 0, uint32_t canEarlyZ
= 0
1073 struct SwrBackendTraits
1075 static const bool bIsCenterPattern
= (isCenter
== 1);
1076 static const uint32_t InputCoverage
= coverage
;
1077 static const bool bCentroidPos
= (centroid
== 1);
1078 static const bool bForcedSampleCount
= (forced
== 1);
1079 static const bool bCanEarlyZ
= (canEarlyZ
== 1);
// multisample traits selected by the sample count and the center-pattern flag
1080 typedef MultisampleTraits
<(SWR_MULTISAMPLE_COUNT
)sampleCountT
, bIsCenterPattern
> MultisampleT
;