1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
// Forward declarations.
// Both Init*FuncTable routines receive a reference to a fixed-size table of backend
// function pointers. The single-sample table is indexed [input coverage][2][2]; the
// sample-rate table adds a leading [multisample type] dimension.
// NOTE(review): presumably these populate the tables; the definitions are not visible here.
31 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2]);
32 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2]);
// Declared ahead of its definition (further down in this file) because
// PixelRateZTestLoop calls it before the definition is reached.
34 static INLINE
void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
);
// Identifiers for the backend variants, in declaration order: single sample,
// MSAA at pixel rate, MSAA at sample rate. SWR_BACKEND_FUNCS_MAX comes last and
// therefore equals the number of variants listed before it.
// NOTE(review): the enum's braces are not visible in this listing.
37 enum SWR_BACKEND_FUNCS
39 SWR_BACKEND_SINGLE_SAMPLE
,
40 SWR_BACKEND_MSAA_PIXEL_RATE
,
41 SWR_BACKEND_MSAA_SAMPLE_RATE
,
42 SWR_BACKEND_FUNCS_MAX
,
// Per-lane pixel offsets for an 8-wide SIMD.
// Reading the X/Y pairs lane by lane: lanes 0-3 form a 2x2 quad at x in {0,1}, y in {0,1};
// lanes 4-7 form a second 2x2 quad at x in {2,3}, y in {0,1}. Together the 8 lanes cover a
// tile 4 pixels wide and 2 pixels tall.
//   vULOffsets*     - offset of each pixel's upper-left corner
//   vCenterOffsets* - the same offsets plus 0.5, i.e. the pixel centers
// NOTE(review): the end of this #if section is not visible in this listing.
45 #if KNOB_SIMD_WIDTH == 8
46 static const __m256 vCenterOffsetsX
= __m256
{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
47 static const __m256 vCenterOffsetsY
= __m256
{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
48 static const __m256 vULOffsetsX
= __m256
{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
49 static const __m256 vULOffsetsY
= __m256
{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
53 static INLINE simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar
const &vI
, simdscalar
const &vJ
)
55 simdscalar vClipMask
= _simd_setzero_ps();
56 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
58 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
60 // pull triangle clip distance values from clip buffer
61 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
62 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
63 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
65 simdscalar vK
= _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f
), vI
), vJ
);
68 simdscalar vInterp
= vplaneps(vA
, vB
, _simd_mul_ps(vK
, vC
), vI
, vJ
);
70 // clip if interpolated clip distance is < 0 || NAN
71 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
73 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
76 return _simd_movemask_ps(vClipMask
);
79 INLINE
static uint32_t RasterTileColorOffset(uint32_t sampleNum
)
81 static const uint32_t RasterTileColorOffsets
[16]
83 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8),
84 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
85 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
86 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
87 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
88 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
89 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
90 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
91 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
92 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
93 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
94 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
95 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
96 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
97 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
99 assert(sampleNum
< 16);
100 return RasterTileColorOffsets
[sampleNum
];
103 INLINE
static uint32_t RasterTileDepthOffset(uint32_t sampleNum
)
105 static const uint32_t RasterTileDepthOffsets
[16]
107 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8),
108 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
109 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
110 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
111 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
112 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
113 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
114 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
115 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
116 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
117 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
118 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
119 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
120 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
121 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
123 assert(sampleNum
< 16);
124 return RasterTileDepthOffsets
[sampleNum
];
127 INLINE
static uint32_t RasterTileStencilOffset(uint32_t sampleNum
)
129 static const uint32_t RasterTileStencilOffsets
[16]
131 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8),
132 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
133 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
134 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
135 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
136 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
137 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
138 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
139 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
140 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
141 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
142 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
143 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
144 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
145 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
147 assert(sampleNum
< 16);
148 return RasterTileStencilOffsets
[sampleNum
];
151 template<typename T
, uint32_t InputCoverage
>
152 struct generateInputCoverage
154 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
156 // will need to update for avx512
157 assert(KNOB_SIMD_WIDTH
== 8);
160 simdscalari sampleCoverage
[2];
162 if(T::bIsCenterPattern
)
164 // center coverage is the same for all samples; just broadcast to the sample slots
165 uint32_t centerCoverage
= ((uint32_t)(*coverageMask
) & MASK
);
166 if(T::MultisampleT::numSamples
== 1)
168 sampleCoverage
[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage
);
170 else if(T::MultisampleT::numSamples
== 2)
172 sampleCoverage
[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage
, centerCoverage
);
174 else if(T::MultisampleT::numSamples
== 4)
176 sampleCoverage
[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage
, centerCoverage
, centerCoverage
, centerCoverage
);
178 else if(T::MultisampleT::numSamples
== 8)
180 sampleCoverage
[0] = _simd_set1_epi32(centerCoverage
);
182 else if(T::MultisampleT::numSamples
== 16)
184 sampleCoverage
[0] = _simd_set1_epi32(centerCoverage
);
185 sampleCoverage
[1] = _simd_set1_epi32(centerCoverage
);
190 simdscalari src
= _simd_set1_epi32(0);
191 simdscalari index0
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1
;
193 if(T::MultisampleT::numSamples
== 1)
195 mask
[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
197 else if(T::MultisampleT::numSamples
== 2)
199 mask
[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
201 else if(T::MultisampleT::numSamples
== 4)
203 mask
[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
205 else if(T::MultisampleT::numSamples
== 8)
207 mask
[0] = _simd_set1_epi32(-1);
209 else if(T::MultisampleT::numSamples
== 16)
211 mask
[0] = _simd_set1_epi32(-1);
212 mask
[1] = _simd_set1_epi32(-1);
213 index1
= _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
216 // gather coverage for samples 0-7
217 sampleCoverage
[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index0
, _mm256_castsi256_ps(mask
[0]), 8));
218 if(T::MultisampleT::numSamples
> 8)
220 // gather coverage for samples 8-15
221 sampleCoverage
[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index1
, _mm256_castsi256_ps(mask
[1]), 8));
225 mask
[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
226 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
227 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
228 simdscalari packedCoverage0
= _simd_shuffle_epi8(sampleCoverage
[0], mask
[0]);
230 simdscalari packedCoverage1
;
231 if(T::MultisampleT::numSamples
> 8)
233 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
234 packedCoverage1
= _simd_shuffle_epi8(sampleCoverage
[1], mask
[0]);
237 #if (KNOB_ARCH == KNOB_ARCH_AVX)
238 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
239 simdscalari hiToLow
= _mm256_permute2f128_si256(packedCoverage0
, packedCoverage0
, 0x83);
240 simdscalar shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
241 packedCoverage0
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), shufRes
, 0xFE));
243 simdscalari packedSampleCoverage
;
244 if(T::MultisampleT::numSamples
> 8)
246 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
247 hiToLow
= _mm256_permute2f128_si256(packedCoverage1
, packedCoverage1
, 0x83);
248 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
249 shufRes
= _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1
), shufRes
, 0xFE);
250 packedCoverage1
= _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes
), _mm256_castps_pd(shufRes
), 0x01)));
251 packedSampleCoverage
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), _mm256_castsi256_ps(packedCoverage1
), 0xFC));
255 packedSampleCoverage
= packedCoverage0
;
258 simdscalari permMask
= _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
259 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
260 packedCoverage0
= _mm256_permutevar8x32_epi32(packedCoverage0
, permMask
);
262 simdscalari packedSampleCoverage
;
263 if(T::MultisampleT::numSamples
> 8)
265 permMask
= _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
266 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
267 packedCoverage1
= _mm256_permutevar8x32_epi32(packedCoverage1
, permMask
);
269 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
270 packedSampleCoverage
= _mm256_blend_epi32(packedCoverage0
, packedCoverage1
, 0x0C);
274 packedSampleCoverage
= packedCoverage0
;
278 for(int32_t i
= KNOB_SIMD_WIDTH
- 1; i
>= 0; i
--)
280 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
281 inputMask
[i
] = _simd_movemask_epi8(packedSampleCoverage
);
283 if(!T::bForcedSampleCount
)
285 // input coverage has to be anded with sample mask if MSAA isn't forced on
286 inputMask
[i
] &= sampleMask
;
289 // shift to the next pixel in the 4x2
290 packedSampleCoverage
= _simd_slli_epi32(packedSampleCoverage
, 1);
294 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, simdscalar
&inputCoverage
, const uint32_t sampleMask
)
296 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
297 generateInputCoverage
<T
, T::InputCoverage
>(coverageMask
, inputMask
, sampleMask
);
298 inputCoverage
= _simd_castsi_ps(_simd_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]));
304 struct generateInputCoverage
<T
, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>
306 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, simdscalar
&inputCoverage
, const uint32_t sampleMask
)
308 // will need to update for avx512
309 assert(KNOB_SIMD_WIDTH
== 8);
310 simdscalari vec
= _simd_set1_epi32(coverageMask
[0]);
311 const simdscalari bit
= _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
312 vec
= _simd_and_si(vec
, bit
);
313 vec
= _simd_cmplt_epi32(_simd_setzero_si(), vec
);
314 vec
= _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec
);
315 inputCoverage
= _simd_castsi_ps(vec
);
318 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
320 uint32_t simdCoverage
= (coverageMask
[0] & MASK
);
321 static const uint32_t FullCoverageMask
= (1 << T::MultisampleT::numSamples
) - 1;
322 for(int i
= 0; i
< KNOB_SIMD_WIDTH
; i
++)
324 // set all samples to covered if conservative coverage mask is set for that pixel
325 inputMask
[i
] = (((1 << i
) & simdCoverage
) > 0) ? FullCoverageMask
: 0;
330 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
331 // Centroid behaves exactly as follows :
332 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
333 // have a sample location there).
334 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
335 // coverage with the SampleMask Rasterizer State.
336 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
337 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
338 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
339 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
341 INLINE
void CalcCentroidPos(SWR_PS_CONTEXT
&psContext
, const SWR_MULTISAMPLE_POS
& samplePos
,
342 const uint64_t *const coverageMask
, const uint32_t sampleMask
,
343 simdscalar
const &vXSamplePosUL
, simdscalar
const &vYSamplePosUL
)
345 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
346 generateInputCoverage
<T
, T::InputCoverage
>(coverageMask
, inputMask
, sampleMask
);
348 // Case (2) - partially covered pixel
350 // scan for first covered sample per pixel in the 4x2 span
351 unsigned long sampleNum
[KNOB_SIMD_WIDTH
];
352 (inputMask
[0] > 0) ? (_BitScanForward(&sampleNum
[0], inputMask
[0])) : (sampleNum
[0] = 0);
353 (inputMask
[1] > 0) ? (_BitScanForward(&sampleNum
[1], inputMask
[1])) : (sampleNum
[1] = 0);
354 (inputMask
[2] > 0) ? (_BitScanForward(&sampleNum
[2], inputMask
[2])) : (sampleNum
[2] = 0);
355 (inputMask
[3] > 0) ? (_BitScanForward(&sampleNum
[3], inputMask
[3])) : (sampleNum
[3] = 0);
356 (inputMask
[4] > 0) ? (_BitScanForward(&sampleNum
[4], inputMask
[4])) : (sampleNum
[4] = 0);
357 (inputMask
[5] > 0) ? (_BitScanForward(&sampleNum
[5], inputMask
[5])) : (sampleNum
[5] = 0);
358 (inputMask
[6] > 0) ? (_BitScanForward(&sampleNum
[6], inputMask
[6])) : (sampleNum
[6] = 0);
359 (inputMask
[7] > 0) ? (_BitScanForward(&sampleNum
[7], inputMask
[7])) : (sampleNum
[7] = 0);
361 // look up and set the sample offsets from UL pixel corner for first covered sample
362 simdscalar vXSample
= _simd_set_ps(samplePos
.X(sampleNum
[7]),
363 samplePos
.X(sampleNum
[6]),
364 samplePos
.X(sampleNum
[5]),
365 samplePos
.X(sampleNum
[4]),
366 samplePos
.X(sampleNum
[3]),
367 samplePos
.X(sampleNum
[2]),
368 samplePos
.X(sampleNum
[1]),
369 samplePos
.X(sampleNum
[0]));
371 simdscalar vYSample
= _simd_set_ps(samplePos
.Y(sampleNum
[7]),
372 samplePos
.Y(sampleNum
[6]),
373 samplePos
.Y(sampleNum
[5]),
374 samplePos
.Y(sampleNum
[4]),
375 samplePos
.Y(sampleNum
[3]),
376 samplePos
.Y(sampleNum
[2]),
377 samplePos
.Y(sampleNum
[1]),
378 samplePos
.Y(sampleNum
[0]));
379 // add sample offset to UL pixel corner
380 vXSample
= _simd_add_ps(vXSamplePosUL
, vXSample
);
381 vYSample
= _simd_add_ps(vYSamplePosUL
, vYSample
);
383 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
384 static const simdscalari vFullyCoveredMask
= T::MultisampleT::FullSampleMask();
385 simdscalari vInputCoveragei
= _simd_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]);
386 simdscalari vAllSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vFullyCoveredMask
);
388 static const simdscalari vZero
= _simd_setzero_si();
389 const simdscalari vSampleMask
= _simd_and_si(_simd_set1_epi32(sampleMask
), vFullyCoveredMask
);
390 simdscalari vNoSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vZero
);
391 simdscalari vIsFullSampleMask
= _simd_cmpeq_epi32(vSampleMask
, vFullyCoveredMask
);
392 simdscalari vCase3b
= _simd_and_si(vNoSamplesCovered
, vIsFullSampleMask
);
394 simdscalari vEvalAtCenter
= _simd_or_si(vAllSamplesCovered
, vCase3b
);
396 // set the centroid position based on results from above
397 psContext
.vX
.centroid
= _simd_blendv_ps(vXSample
, psContext
.vX
.center
, _simd_castsi_ps(vEvalAtCenter
));
398 psContext
.vY
.centroid
= _simd_blendv_ps(vYSample
, psContext
.vY
.center
, _simd_castsi_ps(vEvalAtCenter
));
400 // Case (3a) No samples covered and partial sample mask
401 simdscalari vSomeSampleMaskSamples
= _simd_cmplt_epi32(vSampleMask
, vFullyCoveredMask
);
402 // sample mask should never be all 0's for this case, but handle it anyways
403 unsigned long firstCoveredSampleMaskSample
= 0;
404 (sampleMask
> 0) ? (_BitScanForward(&firstCoveredSampleMaskSample
, sampleMask
)) : (firstCoveredSampleMaskSample
= 0);
406 simdscalari vCase3a
= _simd_and_si(vNoSamplesCovered
, vSomeSampleMaskSamples
);
408 vXSample
= _simd_set1_ps(samplePos
.X(firstCoveredSampleMaskSample
));
409 vYSample
= _simd_set1_ps(samplePos
.Y(firstCoveredSampleMaskSample
));
411 // blend in case 3a pixel locations
412 psContext
.vX
.centroid
= _simd_blendv_ps(psContext
.vX
.centroid
, vXSample
, _simd_castsi_ps(vCase3a
));
413 psContext
.vY
.centroid
= _simd_blendv_ps(psContext
.vY
.centroid
, vYSample
, _simd_castsi_ps(vCase3a
));
416 INLINE
void CalcCentroidBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
,
417 const simdscalar
&vXSamplePosUL
, const simdscalar
&vYSamplePosUL
)
420 psContext
.vI
.centroid
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
421 psContext
.vJ
.centroid
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
422 psContext
.vI
.centroid
= _simd_mul_ps(psContext
.vI
.centroid
, coeffs
.vRecipDet
);
423 psContext
.vJ
.centroid
= _simd_mul_ps(psContext
.vJ
.centroid
, coeffs
.vRecipDet
);
426 psContext
.vOneOverW
.centroid
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.centroid
, psContext
.vJ
.centroid
);
429 INLINE simdmask
CalcDepthBoundsAcceptMask(simdscalar
const &z
, float minz
, float maxz
)
431 const simdscalar minzMask
= _simd_cmpge_ps(z
, _simd_set1_ps(minz
));
432 const simdscalar maxzMask
= _simd_cmple_ps(z
, _simd_set1_ps(maxz
));
434 return _simd_movemask_ps(_simd_and_ps(minzMask
, maxzMask
));
438 INLINE
uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount
)
440 // RT has to be single sample if we're in forcedMSAA mode
441 if(T::bForcedSampleCount
&& (T::MultisampleT::sampleCount
> SWR_MULTISAMPLE_1X
))
445 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
446 else if(T::bForcedSampleCount
&& (T::MultisampleT::sampleCount
== SWR_MULTISAMPLE_1X
))
448 return GetNumSamples(blendSampleCount
);
450 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
453 return T::MultisampleT::numSamples
;
457 inline void SetupBarycentricCoeffs(BarycentricCoeffs
*coeffs
, const SWR_TRIANGLE_DESC
&work
)
461 coeffs
->vIa
= _simd_broadcast_ss(&work
.I
[0]);
462 coeffs
->vIb
= _simd_broadcast_ss(&work
.I
[1]);
463 coeffs
->vIc
= _simd_broadcast_ss(&work
.I
[2]);
465 coeffs
->vJa
= _simd_broadcast_ss(&work
.J
[0]);
466 coeffs
->vJb
= _simd_broadcast_ss(&work
.J
[1]);
467 coeffs
->vJc
= _simd_broadcast_ss(&work
.J
[2]);
469 coeffs
->vZa
= _simd_broadcast_ss(&work
.Z
[0]);
470 coeffs
->vZb
= _simd_broadcast_ss(&work
.Z
[1]);
471 coeffs
->vZc
= _simd_broadcast_ss(&work
.Z
[2]);
473 coeffs
->vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
475 coeffs
->vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
476 coeffs
->vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
477 coeffs
->vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
480 inline void SetupRenderBuffers(uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], uint8_t **pDepthBuffer
, uint8_t **pStencilBuffer
, uint32_t colorHotTileMask
, RenderOutputBuffers
&renderBuffers
)
484 while (_BitScanForward(&index
, colorHotTileMask
))
486 assert(index
< SWR_NUM_RENDERTARGETS
);
487 colorHotTileMask
&= ~(1 << index
);
488 pColorBuffer
[index
] = renderBuffers
.pColor
[index
];
493 *pDepthBuffer
= renderBuffers
.pDepth
;
498 *pStencilBuffer
= renderBuffers
.pStencil
;;
503 void SetupPixelShaderContext(SWR_PS_CONTEXT
*psContext
, const SWR_MULTISAMPLE_POS
& samplePos
, SWR_TRIANGLE_DESC
&work
)
505 psContext
->pAttribs
= work
.pAttribs
;
506 psContext
->pPerspAttribs
= work
.pPerspAttribs
;
507 psContext
->frontFace
= work
.triFlags
.frontFacing
;
508 psContext
->renderTargetArrayIndex
= work
.triFlags
.renderTargetArrayIndex
;
510 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
511 psContext
->I
= work
.I
;
512 psContext
->J
= work
.J
;
514 psContext
->recipDet
= work
.recipDet
;
515 psContext
->pRecipW
= work
.pRecipW
;
516 psContext
->pSamplePosX
= samplePos
.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
517 psContext
->pSamplePosY
= samplePos
.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
518 psContext
->rasterizerSampleCount
= T::MultisampleT::numSamples
;
519 psContext
->sampleIndex
= 0;
// CalcCentroid: fills in the centroid position and centroid barycentrics in *psContext.
//  - When IsSingleSample is set, centroid X/Y, I/J and 1/w are copied from the
//    pixel-center values.
//  - The remaining visible statements do one of two things:
//      * take the position from UL + 0.5 (T::bIsCenterPattern) or from CalcCentroidPos<T>(),
//        then derive I/J/1/w with CalcCentroidBarycentrics(); or
//      * copy the per-sample X/Y into the centroid X/Y.
// NOTE(review): this listing has lost its brace/else lines and the condition that selects
// between the last two alternatives, so the exact nesting cannot be read from here --
// confirm against the real file before changing any code below.
522 template<typename T
, bool IsSingleSample
>
523 void CalcCentroid(SWR_PS_CONTEXT
*psContext
, const SWR_MULTISAMPLE_POS
& samplePos
,
524 const BarycentricCoeffs
&coeffs
, const uint64_t * const coverageMask
, uint32_t sampleMask
)
526 if (IsSingleSample
) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
528 // for 1x case, centroid is pixel center
529 psContext
->vX
.centroid
= psContext
->vX
.center
;
530 psContext
->vY
.centroid
= psContext
->vY
.center
;
531 psContext
->vI
.centroid
= psContext
->vI
.center
;
532 psContext
->vJ
.centroid
= psContext
->vJ
.center
;
533 psContext
->vOneOverW
.centroid
= psContext
->vOneOverW
.center
;
539 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
540 if (T::bIsCenterPattern
)
542 psContext
->vX
.centroid
= _simd_add_ps(psContext
->vX
.UL
, _simd_set1_ps(0.5f
));
543 psContext
->vY
.centroid
= _simd_add_ps(psContext
->vY
.UL
, _simd_set1_ps(0.5f
));
547 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
548 CalcCentroidPos
<T
>(*psContext
, samplePos
, coverageMask
, sampleMask
, psContext
->vX
.UL
, psContext
->vY
.UL
);
551 CalcCentroidBarycentrics(coeffs
, *psContext
, psContext
->vX
.UL
, psContext
->vY
.UL
);
555 psContext
->vX
.centroid
= psContext
->vX
.sample
;
556 psContext
->vY
.centroid
= psContext
->vY
.sample
;
562 struct PixelRateZTestLoop
564 PixelRateZTestLoop(DRAW_CONTEXT
*DC
, uint32_t _workerId
, const SWR_TRIANGLE_DESC
&Work
, const BarycentricCoeffs
& Coeffs
, const API_STATE
& apiState
,
565 uint8_t*& depthBuffer
, uint8_t*& stencilBuffer
, const uint8_t ClipDistanceMask
) :
566 pDC(DC
), workerId(_workerId
), work(Work
), coeffs(Coeffs
), state(apiState
), psState(apiState
.psState
),
567 samplePos(state
.rastState
.samplePositions
),
568 clipDistanceMask(ClipDistanceMask
), pDepthBuffer(depthBuffer
), pStencilBuffer(stencilBuffer
){};
571 uint32_t operator()(simdscalar
& activeLanes
, SWR_PS_CONTEXT
& psContext
,
572 const CORE_BUCKETS BEDepthBucket
, uint32_t currentSimdIn8x8
= 0)
575 uint32_t statCount
= 0;
576 simdscalar anyDepthSamplePassed
= _simd_setzero_ps();
577 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
579 const uint8_t *pCoverageMask
= (uint8_t*)&work
.coverageMask
[sample
];
580 vCoverageMask
[sample
] = _simd_and_ps(activeLanes
, _simd_vmask_ps(pCoverageMask
[currentSimdIn8x8
] & MASK
));
582 if(!_simd_movemask_ps(vCoverageMask
[sample
]))
584 vCoverageMask
[sample
] = depthPassMask
[sample
] = stencilPassMask
[sample
] = _simd_setzero_ps();
588 // offset depth/stencil buffers current sample
589 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
590 uint8_t * pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
592 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
594 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
596 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
598 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
599 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
601 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z
, minz
, maxz
)));
604 RDTSC_BEGIN(BEBarycentric
, pDC
->drawId
);
606 // calculate per sample positions
607 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
608 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
610 // calc I & J per sample
611 CalcSampleBarycentrics(coeffs
, psContext
);
613 if(psState
.writesODepth
)
616 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
617 vZ
[sample
] = psContext
.vZ
;
622 vZ
[sample
] = vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
623 vZ
[sample
] = state
.pfnQuantizeDepth(vZ
[sample
]);
626 RDTSC_END(BEBarycentric
, 0);
628 ///@todo: perspective correct vs non-perspective correct clipping?
629 // if clip distances are enabled, we need to interpolate for each sample
632 uint8_t clipMask
= ComputeUserClipMask(clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
634 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], _simd_vmask_ps(~clipMask
));
637 // ZTest for this sample
638 ///@todo Need to uncomment out this bucket.
639 //RDTSC_BEGIN(BEDepthBucket, pDC->drawId);
640 depthPassMask
[sample
] = vCoverageMask
[sample
];
641 stencilPassMask
[sample
] = vCoverageMask
[sample
];
642 depthPassMask
[sample
] = DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
643 vZ
[sample
], pDepthSample
, vCoverageMask
[sample
],
644 pStencilSample
, &stencilPassMask
[sample
]);
645 //RDTSC_END(BEDepthBucket, 0);
647 // early-exit if no pixels passed depth or earlyZ is forced on
648 if(psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
[sample
]))
650 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, vZ
[sample
],
651 pDepthSample
, depthPassMask
[sample
], vCoverageMask
[sample
], pStencilSample
, stencilPassMask
[sample
]);
653 if(!_simd_movemask_ps(depthPassMask
[sample
]))
658 anyDepthSamplePassed
= _simd_or_ps(anyDepthSamplePassed
, depthPassMask
[sample
]);
659 uint32_t statMask
= _simd_movemask_ps(depthPassMask
[sample
]);
660 statCount
+= _mm_popcnt_u32(statMask
);
663 activeLanes
= _simd_and_ps(anyDepthSamplePassed
, activeLanes
);
664 // return number of samples that passed depth and coverage
668 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
669 simdscalar vZ
[T::MultisampleT::numCoverageSamples
];
670 simdscalar vCoverageMask
[T::MultisampleT::numCoverageSamples
];
671 simdscalar depthPassMask
[T::MultisampleT::numCoverageSamples
];
672 simdscalar stencilPassMask
[T::MultisampleT::numCoverageSamples
];
679 const SWR_TRIANGLE_DESC
& work
;
680 const BarycentricCoeffs
& coeffs
;
681 const API_STATE
& state
;
682 const SWR_PS_STATE
& psState
;
683 const SWR_MULTISAMPLE_POS
& samplePos
;
684 const uint8_t clipDistanceMask
;
685 uint8_t*& pDepthBuffer
;
686 uint8_t*& pStencilBuffer
;
689 INLINE
void CalcPixelBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
692 psContext
.vI
.center
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.center
, psContext
.vY
.center
);
693 psContext
.vJ
.center
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.center
, psContext
.vY
.center
);
694 psContext
.vI
.center
= _simd_mul_ps(psContext
.vI
.center
, coeffs
.vRecipDet
);
695 psContext
.vJ
.center
= _simd_mul_ps(psContext
.vJ
.center
, coeffs
.vRecipDet
);
698 psContext
.vOneOverW
.center
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.center
, psContext
.vJ
.center
);
701 static INLINE
void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
704 psContext
.vI
.sample
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
705 psContext
.vJ
.sample
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
706 psContext
.vI
.sample
= _simd_mul_ps(psContext
.vI
.sample
, coeffs
.vRecipDet
);
707 psContext
.vJ
.sample
= _simd_mul_ps(psContext
.vJ
.sample
, coeffs
.vRecipDet
);
710 psContext
.vOneOverW
.sample
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
713 // Merge Output to 4x2 SIMD Tile Format
// For each render target whose bit is set in renderTargetMask: starts from the
// pixel shader output psContext.shaded[rt], runs pfnBlendFunc[rt] when it is
// non-null, and stores the result at pColorBase[rt] + RasterTileColorOffset(sample).
// Only lanes set in (coverageMask & depthPassMask) are stored, and a channel is
// skipped when its writeDisable* flag is set in pBlendState->renderTarget[rt].
// coverageMask is a non-const reference whose address is handed to the blend
// function, so the caller's mask may be modified by this call.
714 INLINE
void OutputMerger4x2(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
715 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar
const &depthPassMask
, uint32_t renderTargetMask
)
717 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
718 const uint32_t rasterTileColorOffset
= RasterTileColorOffset(sample
);
// visit every set bit of renderTargetMask; each bit is cleared once its index is in rt
722 while (_BitScanForward(&rt
, renderTargetMask
))
724 renderTargetMask
&= ~(1 << rt
);
725 uint8_t *pColorSample
= pColorBase
[rt
] + rasterTileColorOffset
;
727 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
730 // pfnBlendFunc may not update all channels. Initialize with PS output.
731 /// TODO: move this into the blend JIT.
732 blendOut
= psContext
.shaded
[rt
];
734 // Blend outputs and update coverage mask for alpha test
735 if(pfnBlendFunc
[rt
] != nullptr)
739 psContext
.shaded
[rt
],
741 psContext
.shaded
[0].w
,
746 (simdscalari
*)&coverageMask
);
// lanes that are both covered and passed depth are the only ones written below
751 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
753 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
754 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
// byte distance between consecutive channel vectors: x at +0, y at +simd, z at +2*simd, w at +3*simd
756 const uint32_t simd
= KNOB_SIMD_WIDTH
* sizeof(float);
758 // store with color mask
759 if(!pRTBlend
->writeDisableRed
)
761 _simd_maskstore_ps((float*)pColorSample
, outputMask
, blendOut
.x
);
763 if(!pRTBlend
->writeDisableGreen
)
765 _simd_maskstore_ps((float*)(pColorSample
+ simd
), outputMask
, blendOut
.y
);
767 if(!pRTBlend
->writeDisableBlue
)
769 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 2), outputMask
, blendOut
.z
);
771 if(!pRTBlend
->writeDisableAlpha
)
773 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 3), outputMask
, blendOut
.w
);
778 #if USE_8x2_TILE_BACKEND
779 // Merge Output to 8x2 SIMD16 Tile Format
// For each render target whose bit is set in renderTargetMask: starts from the
// pixel shader output psContext.shaded[rt], runs pfnBlendFunc[rt] when it is
// non-null (passing the values read into blendSrc), and stores the result to
// the location derived from pColorBase[rt] and RasterTileColorOffset(sample).
// Only lanes set in (coverageMask & depthPassMask) are stored, and a channel is
// skipped when its writeDisable* flag is set. Channel vectors are addressed as
// pColorSample[0], [2], [4] and [6], i.e. two simdscalars apart.
// coverageMask is a non-const reference whose address is handed to the blend
// function, so the caller's mask may be modified by this call.
780 INLINE
void OutputMerger8x2(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
781 const PFN_BLEND_JIT_FUNC(&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar
const &depthPassMask
, uint32_t renderTargetMask
, bool useAlternateOffset
)
783 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
784 uint32_t rasterTileColorOffset
= RasterTileColorOffset(sample
);
// useAlternateOffset moves the target location forward by one simdscalar
786 if (useAlternateOffset
)
788 rasterTileColorOffset
+= sizeof(simdscalar
);
// visit every set bit of renderTargetMask; each bit is cleared once its index is in rt
795 while (_BitScanForward(&rt
, renderTargetMask
))
797 renderTargetMask
&= ~(1 << rt
);
799 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
801 simdscalar
* pColorSample
;
// true when at least one of the four color channels is write-enabled for this render target
802 bool hotTileEnable
= !pRTBlend
->writeDisableAlpha
|| !pRTBlend
->writeDisableRed
|| !pRTBlend
->writeDisableGreen
|| !pRTBlend
->writeDisableBlue
;
// read the four channel vectors currently stored at the target location (every second simdscalar)
// NOTE(review): the condition guarding this load is not visible in this listing; presumably hotTileEnable - confirm
805 pColorSample
= reinterpret_cast<simdscalar
*>(pColorBase
[rt
] + rasterTileColorOffset
);
806 blendSrc
[0] = pColorSample
[0];
807 blendSrc
[1] = pColorSample
[2];
808 blendSrc
[2] = pColorSample
[4];
809 blendSrc
[3] = pColorSample
[6];
// NOTE(review): presumably the branch taken when every channel is write-disabled, in which case
// none of the stores below dereference pColorSample - confirm against the full source
813 pColorSample
= nullptr;
817 // pfnBlendFunc may not update all channels. Initialize with PS output.
818 /// TODO: move this into the blend JIT.
819 blendOut
= psContext
.shaded
[rt
];
821 // Blend outputs and update coverage mask for alpha test
822 if(pfnBlendFunc
[rt
] != nullptr)
826 psContext
.shaded
[rt
],
828 psContext
.shaded
[0].w
,
830 reinterpret_cast<uint8_t *>(&blendSrc
),
833 reinterpret_cast<simdscalari
*>(&coverageMask
));
// lanes that are both covered and passed depth are the only ones written below
838 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
840 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
841 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
843 // store with color mask
844 if (!pRTBlend
->writeDisableRed
)
846 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[0]), outputMask
, blendOut
.x
);
848 if (!pRTBlend
->writeDisableGreen
)
850 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[2]), outputMask
, blendOut
.y
);
852 if (!pRTBlend
->writeDisableBlue
)
854 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[4]), outputMask
, blendOut
.z
);
856 if (!pRTBlend
->writeDisableAlpha
)
858 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[6]), outputMask
, blendOut
.w
);
// Pixel-rate backend. Walks the region starting at (x, y) that spans
// KNOB_TILE_X_DIM by KNOB_TILE_Y_DIM, in steps of SIMD_TILE_X_DIM by
// SIMD_TILE_Y_DIM. For each SIMD tile it:
//   1. skips to Endtile when no bit of work.anyCoveredSamples is set under MASK,
//   2. computes center barycentrics and centroid data,
//   3. runs PixelRateZTest before the shader when T::bCanEarlyZ, or after it
//      otherwise (never when T::bForcedSampleCount),
//   4. invokes state.psState.pfnPixelShader once,
//   5. loops over the output-merger samples, writing the shaded result through
//      OutputMerger8x2 / OutputMerger4x2 and, unless forceEarlyZ or
//      T::bForcedSampleCount is set, calling DepthStencilWrite.
// work is modified: its coverage masks are shifted right as tiles are consumed.
// The color, depth and stencil buffer pointers are advanced after every tile.
866 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
868 ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
871 RDTSC_BEGIN(BEPixelRateBackend
, pDC
->drawId
);
872 RDTSC_BEGIN(BESetup
, pDC
->drawId
);
874 const API_STATE
&state
= GetApiState(pDC
);
876 BarycentricCoeffs coeffs
;
877 SetupBarycentricCoeffs(&coeffs
, work
);
879 SWR_PS_CONTEXT psContext
;
880 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
881 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
883 uint8_t *pDepthBuffer
, *pStencilBuffer
;
884 SetupRenderBuffers(psContext
.pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.colorHottileEnable
, renderBuffers
);
886 RDTSC_END(BESetup
, 0);
// depth test helper; its vCoverageMask / depthPassMask / vZ / stencilPassMask members
// are read back in the output-merger loop below
888 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, workerId
, work
, coeffs
, state
, pDepthBuffer
, pStencilBuffer
, state
.backendState
.clipDistanceMask
);
// initial Y positions for the first row of SIMD tiles; stepped by dy at the end of each row
890 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
891 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
893 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
895 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X positions restart at x for every row; stepped by dx after each SIMD tile
897 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
898 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
900 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
902 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
904 #if USE_8x2_TILE_BACKEND
// set on SIMD tiles whose xx has the SIMD_TILE_X_DIM bit set
905 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
907 simdscalar activeLanes
;
// no covered samples under MASK for this SIMD tile: skip to the end-of-tile bookkeeping
908 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
909 activeLanes
= _simd_vmask_ps(work
.anyCoveredSamples
& MASK
);
911 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
// inner-conservative mode reads work.innerCoverageMask, any other mode reads work.coverageMask
913 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
915 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
918 RDTSC_BEGIN(BEBarycentric
, pDC
->drawId
);
920 CalcPixelBarycentrics(coeffs
, psContext
);
922 CalcCentroid
<T
, false>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
924 RDTSC_END(BEBarycentric
, 0);
926 if(T::bForcedSampleCount
)
928 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
929 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state
.blendState
.sampleMask
), _simd_setzero_si()));
930 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
// early Z: depth test runs before the pixel shader
934 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
936 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
937 UPDATE_STAT_BE(DepthPassCount
, depthPassCount
);
938 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount
, _simd_movemask_ps(activeLanes
)));
941 // if we have no covered samples that passed depth at this point, go to next tile
942 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
944 if(state
.psState
.usesSourceDepth
)
946 RDTSC_BEGIN(BEBarycentric
, pDC
->drawId
);
947 // interpolate and quantize z
948 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
949 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
950 RDTSC_END(BEBarycentric
, 0);
953 // pixels that are currently active
954 psContext
.activeMask
= _simd_castps_si(activeLanes
);
955 psContext
.oMask
= T::MultisampleT::FullSampleMask();
957 // execute pixel shader
958 RDTSC_BEGIN(BEPixelShader
, pDC
->drawId
);
959 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
960 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
961 RDTSC_END(BEPixelShader
, 0);
963 // update active lanes to remove any discarded or oMask'd pixels
964 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
965 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// late Z: depth test runs after the pixel shader
968 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
970 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
971 UPDATE_STAT_BE(DepthPassCount
, depthPassCount
);
972 AR_EVENT(LateDepthInfoPixelRate(depthPassCount
, _simd_movemask_ps(activeLanes
)));
975 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
976 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
979 // loop over all samples, broadcasting the results of the PS to all passing pixels
980 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(state
.blendState
.sampleCount
); sample
++)
982 RDTSC_BEGIN(BEOutputMerger
, pDC
->drawId
);
983 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
984 uint32_t coverageSampleNum
= (T::bIsCenterPattern
) ? 0 : sample
;
985 simdscalar coverageMask
, depthMask
;
// forced sample count: no depth test was run, so the active lanes serve as both masks
986 if(T::bForcedSampleCount
)
988 coverageMask
= depthMask
= activeLanes
;
// NOTE(review): the else/continue lines are not visible in this listing; the masks below are
// presumably the non-forced path, with samples that have no passing lanes skipped - confirm
992 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
993 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
994 if(!_simd_movemask_ps(depthMask
))
996 // stencil should already have been written in early/lateZ tests
997 RDTSC_END(BEOutputMerger
, 0);
1002 // broadcast the results of the PS to all passing pixels
1003 #if USE_8x2_TILE_BACKEND
1004 OutputMerger8x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
,state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.renderTargetMask
, useAlternateOffset
);
1005 #else // USE_8x2_TILE_BACKEND
1006 OutputMerger4x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.renderTargetMask
);
1007 #endif // USE_8x2_TILE_BACKEND
// depth/stencil write happens here unless forceEarlyZ or forced sample count is in effect
1009 if(!state
.psState
.forceEarlyZ
&& !T::bForcedSampleCount
)
1011 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
1012 uint8_t * pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
1014 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
1015 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
1017 RDTSC_END(BEOutputMerger
, 0);
1020 RDTSC_BEGIN(BEEndTile
, pDC
->drawId
);
// drop the bits of the SIMD tile just processed from every coverage mask
1022 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1024 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1027 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
1029 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1031 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1033 #if USE_8x2_TILE_BACKEND
// 8x2 path: color pointers advance by two SIMD widths, and only on tiles with useAlternateOffset set
1034 if (useAlternateOffset
)
1037 uint32_t rtMask
= state
.colorHottileEnable
;
1038 while (_BitScanForward(&rt
, rtMask
))
1040 rtMask
&= ~(1 << rt
);
1041 psContext
.pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// NOTE(review): the preprocessor line separating the two paths is not visible in this listing;
// the loop below, advancing by one SIMD width per tile, is presumably the non-8x2 path - confirm
1046 uint32_t rtMask
= state
.colorHottileEnable
;
1047 while (_BitScanForward(&rt
, rtMask
))
1049 rtMask
&= ~(1 << rt
);
1050 psContext
.pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// depth and stencil pointers advance by one SIMD width of their hot tile format after every tile
1053 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1054 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1056 RDTSC_END(BEEndTile
, 0);
1058 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
1059 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
1062 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
1063 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
1066 RDTSC_END(BEPixelRateBackend
, 0);
1069 template<uint32_t sampleCountT
= SWR_MULTISAMPLE_1X
, uint32_t isCenter
= 0,
1070 uint32_t coverage
= 0, uint32_t centroid
= 0, uint32_t forced
= 0, uint32_t canEarlyZ
= 0
1072 struct SwrBackendTraits
1074 static const bool bIsCenterPattern
= (isCenter
== 1);
1075 static const uint32_t InputCoverage
= coverage
;
1076 static const bool bCentroidPos
= (centroid
== 1);
1077 static const bool bForcedSampleCount
= (forced
== 1);
1078 static const bool bCanEarlyZ
= (canEarlyZ
== 1);
1079 typedef MultisampleTraits
<(SWR_MULTISAMPLE_COUNT
)sampleCountT
, bIsCenterPattern
> MultisampleT
;