1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
31 #include "common/os.h"
32 #include "core/context.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
36 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
);
37 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
38 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
39 void ProcessStoreTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
);
40 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
);
41 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
42 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
);
43 void InitClearTilesTable();
44 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
);
45 void InitBackendFuncTables();
46 void InitCPSFuncTables();
47 void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
);
/// Back-end function variants. SWR_BACKEND_FUNCS_MAX is the last enumerator
/// and therefore equals the number of variants that precede it.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE,
    SWR_BACKEND_MSAA_PIXEL_RATE,
    SWR_BACKEND_MSAA_SAMPLE_RATE,
    SWR_BACKEND_FUNCS_MAX,
};
#if KNOB_SIMD_WIDTH == 8
// SIMD constants defined in another translation unit.
// NOTE(review): presumably per-lane X/Y offsets to the pixel centers and to the
// upper-left (UL) corners of the pixels in a SIMD tile -- confirm at the definition.
extern const __m256 vCenterOffsetsX;
extern const __m256 vCenterOffsetsY;
extern const __m256 vULOffsetsX;
extern const __m256 vULOffsetsY;
// NOTE(review): MASK is used throughout this header but no definition is visible
// in this listing; it may have been defined inside this #if block. Confirm against
// version control before building.
#endif
65 INLINE
static uint32_t RasterTileColorOffset(uint32_t sampleNum
)
67 static const uint32_t RasterTileColorOffsets
[16]
69 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8),
70 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
71 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
72 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
73 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
74 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
75 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
76 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
77 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
78 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
79 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
80 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
81 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
82 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
83 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
85 assert(sampleNum
< 16);
86 return RasterTileColorOffsets
[sampleNum
];
89 INLINE
static uint32_t RasterTileDepthOffset(uint32_t sampleNum
)
91 static const uint32_t RasterTileDepthOffsets
[16]
93 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8),
94 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
95 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
96 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
97 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
98 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
99 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
100 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
101 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
102 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
103 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
104 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
105 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
106 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
107 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
109 assert(sampleNum
< 16);
110 return RasterTileDepthOffsets
[sampleNum
];
113 INLINE
static uint32_t RasterTileStencilOffset(uint32_t sampleNum
)
115 static const uint32_t RasterTileStencilOffsets
[16]
117 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8),
118 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
119 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
120 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
121 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
122 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
123 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
124 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
125 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
126 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
127 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
128 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
129 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
130 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
131 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
133 assert(sampleNum
< 16);
134 return RasterTileStencilOffsets
[sampleNum
];
137 template<typename T
, uint32_t InputCoverage
>
138 struct generateInputCoverage
140 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
142 // will need to update for avx512
143 assert(KNOB_SIMD_WIDTH
== 8);
146 __m256i sampleCoverage
[2];
147 if(T::bIsStandardPattern
)
149 __m256i src
= _mm256_set1_epi32(0);
150 __m256i index0
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1
;
152 if(T::MultisampleT::numSamples
== 1)
154 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
156 else if(T::MultisampleT::numSamples
== 2)
158 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
160 else if(T::MultisampleT::numSamples
== 4)
162 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
164 else if(T::MultisampleT::numSamples
== 8)
166 mask
[0] = _mm256_set1_epi32(-1);
168 else if(T::MultisampleT::numSamples
== 16)
170 mask
[0] = _mm256_set1_epi32(-1);
171 mask
[1] = _mm256_set1_epi32(-1);
172 index1
= _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
175 // gather coverage for samples 0-7
176 sampleCoverage
[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index0
, _mm256_castsi256_ps(mask
[0]), 8));
177 if(T::MultisampleT::numSamples
> 8)
179 // gather coverage for samples 8-15
180 sampleCoverage
[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index1
, _mm256_castsi256_ps(mask
[1]), 8));
185 // center coverage is the same for all samples; just broadcast to the sample slots
186 uint32_t centerCoverage
= ((uint32_t)(*coverageMask
) & MASK
);
187 if(T::MultisampleT::numSamples
== 1)
189 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage
);
191 else if(T::MultisampleT::numSamples
== 2)
193 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage
, centerCoverage
);
195 else if(T::MultisampleT::numSamples
== 4)
197 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage
, centerCoverage
, centerCoverage
, centerCoverage
);
199 else if(T::MultisampleT::numSamples
== 8)
201 sampleCoverage
[0] = _mm256_set1_epi32(centerCoverage
);
203 else if(T::MultisampleT::numSamples
== 16)
205 sampleCoverage
[0] = _mm256_set1_epi32(centerCoverage
);
206 sampleCoverage
[1] = _mm256_set1_epi32(centerCoverage
);
210 mask
[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
211 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
212 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
213 __m256i packedCoverage0
= _simd_shuffle_epi8(sampleCoverage
[0], mask
[0]);
215 __m256i packedCoverage1
;
216 if(T::MultisampleT::numSamples
> 8)
218 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
219 packedCoverage1
= _simd_shuffle_epi8(sampleCoverage
[1], mask
[0]);
222 #if (KNOB_ARCH == KNOB_ARCH_AVX)
223 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
224 __m256i hiToLow
= _mm256_permute2f128_si256(packedCoverage0
, packedCoverage0
, 0x83);
225 __m256 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
226 packedCoverage0
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), shufRes
, 0xFE));
228 __m256i packedSampleCoverage
;
229 if(T::MultisampleT::numSamples
> 8)
231 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
232 hiToLow
= _mm256_permute2f128_si256(packedCoverage1
, packedCoverage1
, 0x83);
233 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
234 shufRes
= _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1
), shufRes
, 0xFE);
235 packedCoverage1
= _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes
), _mm256_castps_pd(shufRes
), 0x01)));
236 packedSampleCoverage
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), _mm256_castsi256_ps(packedCoverage1
), 0xFC));
240 packedSampleCoverage
= packedCoverage0
;
243 __m256i permMask
= _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
244 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
245 packedCoverage0
= _mm256_permutevar8x32_epi32(packedCoverage0
, permMask
);
247 __m256i packedSampleCoverage
;
248 if(T::MultisampleT::numSamples
> 8)
250 permMask
= _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
251 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
252 packedCoverage1
= _mm256_permutevar8x32_epi32(packedCoverage1
, permMask
);
254 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
255 packedSampleCoverage
= _mm256_blend_epi32(packedCoverage0
, packedCoverage1
, 0x0C);
259 packedSampleCoverage
= packedCoverage0
;
263 for(int32_t i
= KNOB_SIMD_WIDTH
- 1; i
>= 0; i
--)
265 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
266 inputMask
[i
] = _simd_movemask_epi8(packedSampleCoverage
);
268 if(!T::bForcedSampleCount
)
270 // input coverage has to be anded with sample mask if MSAA isn't forced on
271 inputMask
[i
] &= sampleMask
;
274 // shift to the next pixel in the 4x2
275 packedSampleCoverage
= _simd_slli_epi32(packedSampleCoverage
, 1);
279 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, __m256
&inputCoverage
, const uint32_t sampleMask
)
281 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
282 generateInputCoverage
<T
, T::InputCoverage
>(coverageMask
, inputMask
, sampleMask
);
283 inputCoverage
= _simd_castsi_ps(_mm256_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]));
289 struct generateInputCoverage
<T
, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>
291 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, __m256
&inputCoverage
, const uint32_t sampleMask
)
293 // will need to update for avx512
294 assert(KNOB_SIMD_WIDTH
== 8);
295 __m256i vec
= _mm256_set1_epi32(coverageMask
[0]);
296 const __m256i bit
= _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
297 vec
= _simd_and_si(vec
, bit
);
298 vec
= _simd_cmplt_epi32(_mm256_setzero_si256(), vec
);
299 vec
= _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec
);
300 inputCoverage
= _simd_castsi_ps(vec
);
303 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
305 uint32_t simdCoverage
= (coverageMask
[0] & MASK
);
306 static const uint32_t FullCoverageMask
= (1 << T::MultisampleT::numSamples
) - 1;
307 for(int i
= 0; i
< KNOB_SIMD_WIDTH
; i
++)
309 // set all samples to covered if conservative coverage mask is set for that pixel
310 inputMask
[i
] = (((1 << i
) & simdCoverage
) > 0) ? FullCoverageMask
: 0;
315 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
316 // Centroid behaves exactly as follows :
317 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
318 // have a sample location there).
319 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
320 // coverage with the SampleMask Rasterizer State.
321 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
322 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
323 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
324 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
326 INLINE
void CalcCentroidPos(SWR_PS_CONTEXT
&psContext
, const uint64_t *const coverageMask
, const uint32_t sampleMask
,
327 const simdscalar vXSamplePosUL
, const simdscalar vYSamplePosUL
)
329 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
330 generateInputCoverage
<T
, T::InputCoverage
>(coverageMask
, inputMask
, sampleMask
);
332 // Case (2) - partially covered pixel
334 // scan for first covered sample per pixel in the 4x2 span
335 unsigned long sampleNum
[KNOB_SIMD_WIDTH
];
336 (inputMask
[0] > 0) ? (_BitScanForward(&sampleNum
[0], inputMask
[0])) : (sampleNum
[0] = 0);
337 (inputMask
[1] > 0) ? (_BitScanForward(&sampleNum
[1], inputMask
[1])) : (sampleNum
[1] = 0);
338 (inputMask
[2] > 0) ? (_BitScanForward(&sampleNum
[2], inputMask
[2])) : (sampleNum
[2] = 0);
339 (inputMask
[3] > 0) ? (_BitScanForward(&sampleNum
[3], inputMask
[3])) : (sampleNum
[3] = 0);
340 (inputMask
[4] > 0) ? (_BitScanForward(&sampleNum
[4], inputMask
[4])) : (sampleNum
[4] = 0);
341 (inputMask
[5] > 0) ? (_BitScanForward(&sampleNum
[5], inputMask
[5])) : (sampleNum
[5] = 0);
342 (inputMask
[6] > 0) ? (_BitScanForward(&sampleNum
[6], inputMask
[6])) : (sampleNum
[6] = 0);
343 (inputMask
[7] > 0) ? (_BitScanForward(&sampleNum
[7], inputMask
[7])) : (sampleNum
[7] = 0);
345 // look up and set the sample offsets from UL pixel corner for first covered sample
346 __m256 vXSample
= _mm256_set_ps(T::MultisampleT::X(sampleNum
[7]),
347 T::MultisampleT::X(sampleNum
[6]),
348 T::MultisampleT::X(sampleNum
[5]),
349 T::MultisampleT::X(sampleNum
[4]),
350 T::MultisampleT::X(sampleNum
[3]),
351 T::MultisampleT::X(sampleNum
[2]),
352 T::MultisampleT::X(sampleNum
[1]),
353 T::MultisampleT::X(sampleNum
[0]));
355 __m256 vYSample
= _mm256_set_ps(T::MultisampleT::Y(sampleNum
[7]),
356 T::MultisampleT::Y(sampleNum
[6]),
357 T::MultisampleT::Y(sampleNum
[5]),
358 T::MultisampleT::Y(sampleNum
[4]),
359 T::MultisampleT::Y(sampleNum
[3]),
360 T::MultisampleT::Y(sampleNum
[2]),
361 T::MultisampleT::Y(sampleNum
[1]),
362 T::MultisampleT::Y(sampleNum
[0]));
363 // add sample offset to UL pixel corner
364 vXSample
= _simd_add_ps(vXSamplePosUL
, vXSample
);
365 vYSample
= _simd_add_ps(vYSamplePosUL
, vYSample
);
367 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
368 static const __m256i vFullyCoveredMask
= T::MultisampleT::FullSampleMask();
369 __m256i vInputCoveragei
= _mm256_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]);
370 __m256i vAllSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vFullyCoveredMask
);
372 static const __m256i vZero
= _simd_setzero_si();
373 const __m256i vSampleMask
= _simd_and_si(_simd_set1_epi32(sampleMask
), vFullyCoveredMask
);
374 __m256i vNoSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vZero
);
375 __m256i vIsFullSampleMask
= _simd_cmpeq_epi32(vSampleMask
, vFullyCoveredMask
);
376 __m256i vCase3b
= _simd_and_si(vNoSamplesCovered
, vIsFullSampleMask
);
378 __m256i vEvalAtCenter
= _simd_or_si(vAllSamplesCovered
, vCase3b
);
380 // set the centroid position based on results from above
381 psContext
.vX
.centroid
= _simd_blendv_ps(vXSample
, psContext
.vX
.center
, _simd_castsi_ps(vEvalAtCenter
));
382 psContext
.vY
.centroid
= _simd_blendv_ps(vYSample
, psContext
.vY
.center
, _simd_castsi_ps(vEvalAtCenter
));
384 // Case (3a) No samples covered and partial sample mask
385 __m256i vSomeSampleMaskSamples
= _simd_cmplt_epi32(vSampleMask
, vFullyCoveredMask
);
386 // sample mask should never be all 0's for this case, but handle it anyways
387 unsigned long firstCoveredSampleMaskSample
= 0;
388 (sampleMask
> 0) ? (_BitScanForward(&firstCoveredSampleMaskSample
, sampleMask
)) : (firstCoveredSampleMaskSample
= 0);
390 __m256i vCase3a
= _simd_and_si(vNoSamplesCovered
, vSomeSampleMaskSamples
);
392 vXSample
= _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample
));
393 vYSample
= _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample
));
395 // blend in case 3a pixel locations
396 psContext
.vX
.centroid
= _simd_blendv_ps(psContext
.vX
.centroid
, vXSample
, _simd_castsi_ps(vCase3a
));
397 psContext
.vY
.centroid
= _simd_blendv_ps(psContext
.vY
.centroid
, vYSample
, _simd_castsi_ps(vCase3a
));
400 INLINE
void CalcCentroidBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
,
401 const simdscalar vXSamplePosUL
, const simdscalar vYSamplePosUL
)
404 psContext
.vI
.centroid
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
405 psContext
.vJ
.centroid
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
406 psContext
.vI
.centroid
= _simd_mul_ps(psContext
.vI
.centroid
, coeffs
.vRecipDet
);
407 psContext
.vJ
.centroid
= _simd_mul_ps(psContext
.vJ
.centroid
, coeffs
.vRecipDet
);
410 psContext
.vOneOverW
.centroid
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.centroid
, psContext
.vJ
.centroid
);
413 INLINE simdmask
CalcDepthBoundsAcceptMask(simdscalar z
, float minz
, float maxz
)
415 const simdscalar minzMask
= _simd_cmpge_ps(z
, _simd_set1_ps(minz
));
416 const simdscalar maxzMask
= _simd_cmple_ps(z
, _simd_set1_ps(maxz
));
418 return _simd_movemask_ps(_simd_and_ps(minzMask
, maxzMask
));
422 INLINE
uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount
)
424 // RT has to be single sample if we're in forcedMSAA mode
425 if(T::bForcedSampleCount
&& (T::MultisampleT::sampleCount
> SWR_MULTISAMPLE_1X
))
429 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
430 else if(T::bForcedSampleCount
&& (T::MultisampleT::sampleCount
== SWR_MULTISAMPLE_1X
))
432 return GetNumSamples(blendSampleCount
);
434 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
437 return T::MultisampleT::numSamples
;
442 struct PixelRateZTestLoop
444 PixelRateZTestLoop(DRAW_CONTEXT
*DC
, uint32_t _workerId
, const SWR_TRIANGLE_DESC
&Work
, const BarycentricCoeffs
& Coeffs
, const API_STATE
& apiState
,
445 uint8_t*& depthBase
, uint8_t*& stencilBase
, const uint8_t ClipDistanceMask
) :
446 pDC(DC
), workerId(_workerId
), work(Work
), coeffs(Coeffs
), state(apiState
), psState(apiState
.psState
),
447 clipDistanceMask(ClipDistanceMask
), pDepthBase(depthBase
), pStencilBase(stencilBase
) {};
450 uint32_t operator()(simdscalar
& activeLanes
, SWR_PS_CONTEXT
& psContext
,
451 const CORE_BUCKETS BEDepthBucket
, uint32_t currentSimdIn8x8
= 0)
453 SWR_CONTEXT
*pContext
= pDC
->pContext
;
455 uint32_t statCount
= 0;
456 simdscalar anyDepthSamplePassed
= _simd_setzero_ps();
457 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
459 const uint8_t *pCoverageMask
= (uint8_t*)&work
.coverageMask
[sample
];
460 vCoverageMask
[sample
] = _simd_and_ps(activeLanes
, vMask(pCoverageMask
[currentSimdIn8x8
] & MASK
));
462 if(!_simd_movemask_ps(vCoverageMask
[sample
]))
464 vCoverageMask
[sample
] = depthPassMask
[sample
] = stencilPassMask
[sample
] = _simd_setzero_ps();
468 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
469 // calculate per sample positions
470 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, T::MultisampleT::vX(sample
));
471 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, T::MultisampleT::vY(sample
));
473 // calc I & J per sample
474 CalcSampleBarycentrics(coeffs
, psContext
);
476 if(psState
.writesODepth
)
478 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
479 vZ
[sample
] = psContext
.vZ
;
483 vZ
[sample
] = vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
484 vZ
[sample
] = state
.pfnQuantizeDepth(vZ
[sample
]);
486 AR_END(BEBarycentric
, 0);
488 ///@todo: perspective correct vs non-perspective correct clipping?
489 // if clip distances are enabled, we need to interpolate for each sample
492 uint8_t clipMask
= ComputeUserClipMask(clipDistanceMask
, work
.pUserClipBuffer
,
493 psContext
.vI
.sample
, psContext
.vJ
.sample
);
494 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], vMask(~clipMask
));
497 // offset depth/stencil buffers current sample
498 uint8_t *pDepthSample
= pDepthBase
+ RasterTileDepthOffset(sample
);
499 uint8_t * pStencilSample
= pStencilBase
+ RasterTileStencilOffset(sample
);
501 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
503 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
505 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
507 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
508 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
510 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], vMask(CalcDepthBoundsAcceptMask(z
, minz
, maxz
)));
513 // ZTest for this sample
514 ///@todo Need to uncomment out this bucket.
515 //AR_BEGIN(BEDepthBucket, pDC->drawId);
516 depthPassMask
[sample
] = vCoverageMask
[sample
];
517 stencilPassMask
[sample
] = vCoverageMask
[sample
];
518 depthPassMask
[sample
] = DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
519 vZ
[sample
], pDepthSample
, vCoverageMask
[sample
],
520 pStencilSample
, &stencilPassMask
[sample
]);
521 //AR_END(BEDepthBucket, 0);
523 // early-exit if no pixels passed depth or earlyZ is forced on
524 if(psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
[sample
]))
526 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, vZ
[sample
],
527 pDepthSample
, depthPassMask
[sample
], vCoverageMask
[sample
], pStencilSample
, stencilPassMask
[sample
]);
529 if(!_simd_movemask_ps(depthPassMask
[sample
]))
534 anyDepthSamplePassed
= _simd_or_ps(anyDepthSamplePassed
, depthPassMask
[sample
]);
535 uint32_t statMask
= _simd_movemask_ps(depthPassMask
[sample
]);
536 statCount
+= _mm_popcnt_u32(statMask
);
539 activeLanes
= _simd_and_ps(anyDepthSamplePassed
, activeLanes
);
540 // return number of samples that passed depth and coverage
544 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
545 simdscalar vZ
[T::MultisampleT::numCoverageSamples
];
546 simdscalar vCoverageMask
[T::MultisampleT::numCoverageSamples
];
547 simdscalar depthPassMask
[T::MultisampleT::numCoverageSamples
];
548 simdscalar stencilPassMask
[T::MultisampleT::numCoverageSamples
];
555 const SWR_TRIANGLE_DESC
& work
;
556 const BarycentricCoeffs
& coeffs
;
557 const API_STATE
& state
;
558 const SWR_PS_STATE
& psState
;
559 const uint8_t clipDistanceMask
;
560 uint8_t*& pDepthBase
;
561 uint8_t*& pStencilBase
;
564 INLINE
void CalcPixelBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
567 psContext
.vI
.center
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.center
, psContext
.vY
.center
);
568 psContext
.vJ
.center
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.center
, psContext
.vY
.center
);
569 psContext
.vI
.center
= _simd_mul_ps(psContext
.vI
.center
, coeffs
.vRecipDet
);
570 psContext
.vJ
.center
= _simd_mul_ps(psContext
.vJ
.center
, coeffs
.vRecipDet
);
573 psContext
.vOneOverW
.center
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.center
, psContext
.vJ
.center
);
576 INLINE
void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
579 psContext
.vI
.sample
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
580 psContext
.vJ
.sample
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
581 psContext
.vI
.sample
= _simd_mul_ps(psContext
.vI
.sample
, coeffs
.vRecipDet
);
582 psContext
.vJ
.sample
= _simd_mul_ps(psContext
.vJ
.sample
, coeffs
.vRecipDet
);
585 psContext
.vOneOverW
.sample
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// OutputMerger: for each of the first NumRT render targets, starts from the
// pixel shader output (psContext.shaded[rt]), runs the render target's blend
// function when one is set, and mask-stores the x/y/z/w channels of blendOut
// at the sample's offset inside that target's color buffer. A channel is
// skipped when its writeDisable* flag is set.
//
// NOTE(review): this listing is extraction residue. The leading numbers are
// source-listing line numbers, and several lines are missing: braces, the
// declaration of blendOut, and most of the blend call (only two of its
// arguments are visible). Restore the block from version control before
// building; do not reconstruct the call by hand.
588 INLINE
void OutputMerger(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
589 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar depthPassMask
, const uint32_t NumRT
)
591 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
// byte offset of this sample's data within each render target's raster tile
592 const uint32_t rasterTileColorOffset
= RasterTileColorOffset(sample
);
595 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
597 uint8_t *pColorSample
= pColorBase
[rt
] + rasterTileColorOffset
;
599 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
600 // pfnBlendFunc may not update all channels. Initialize with PS output.
601 /// TODO: move this into the blend JIT.
602 blendOut
= psContext
.shaded
[rt
];
604 // Blend outputs and update coverage mask for alpha test
605 if(pfnBlendFunc
[rt
] != nullptr)
// NOTE(review): the call expression and most of its arguments are missing here.
609 psContext
.shaded
[rt
],
615 (simdscalari
*)&coverageMask
);
// only lanes that are both covered and passed depth get written
619 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
621 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
622 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
// byte stride between consecutive channel vectors in the destination
624 const uint32_t simd
= KNOB_SIMD_WIDTH
* sizeof(float);
626 // store with color mask
627 if(!pRTBlend
->writeDisableRed
)
629 _simd_maskstore_ps((float*)pColorSample
, outputMask
, blendOut
.x
);
631 if(!pRTBlend
->writeDisableGreen
)
633 _simd_maskstore_ps((float*)(pColorSample
+ simd
), outputMask
, blendOut
.y
);
635 if(!pRTBlend
->writeDisableBlue
)
637 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 2), outputMask
, blendOut
.z
);
639 if(!pRTBlend
->writeDisableAlpha
)
641 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 3), outputMask
, blendOut
.w
);
// OutputMerger overload compiled only when USE_8x2_TILE_BACKEND is set. It
// differs from the overload without useAlternateOffset in three ways:
//   - it asserts sample == 0;
//   - useAlternateOffset shifts the tile offset by sizeof(simdscalar);
//   - the destination is addressed as simdscalar elements, with the four
//     channels at indices 0, 2, 4 and 6. The current destination values are
//     copied into blendSrc and handed to the blend function.
//
// NOTE(review): this listing is extraction residue. The leading numbers are
// source-listing line numbers, and several lines are missing: braces, the
// declarations of blendSrc and blendOut, part of the blend call's argument
// list, and the closing preprocessor directive. Restore the block from
// version control before building; do not reconstruct the call by hand.
646 #if USE_8x2_TILE_BACKEND
647 INLINE
void OutputMerger(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
648 const PFN_BLEND_JIT_FUNC(&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar depthPassMask
, const uint32_t NumRT
, bool useAlternateOffset
)
650 assert(sample
== 0); // will need up upate Raster Tile Color Offsets to support more than single sample here..
652 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
653 uint32_t rasterTileColorOffset
= RasterTileColorOffset(sample
);
// step one SIMD vector further into the tile when the alternate offset is requested
655 if (useAlternateOffset
)
657 rasterTileColorOffset
+= sizeof(simdscalar
);
663 for (uint32_t rt
= 0; rt
< NumRT
; ++rt
)
665 simdscalar
*pColorSample
= reinterpret_cast<simdscalar
*>(pColorBase
[rt
] + rasterTileColorOffset
);
667 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
668 // pfnBlendFunc may not update all channels. Initialize with PS output.
669 /// TODO: move this into the blend JIT.
670 blendOut
= psContext
.shaded
[rt
];
// snapshot the current destination channels (every second simdscalar) for the blend function
672 blendSrc
[0] = pColorSample
[0];
673 blendSrc
[1] = pColorSample
[2];
674 blendSrc
[2] = pColorSample
[4];
675 blendSrc
[3] = pColorSample
[6];
677 // Blend outputs and update coverage mask for alpha test
678 if (pfnBlendFunc
[rt
] != nullptr)
// NOTE(review): the call expression and several of its arguments are missing here.
682 psContext
.shaded
[rt
],
685 reinterpret_cast<uint8_t *>(&blendSrc
),
688 reinterpret_cast<simdscalari
*>(&coverageMask
));
// only lanes that are both covered and passed depth get written
692 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
694 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
695 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
697 // store with color mask
698 if (!pRTBlend
->writeDisableRed
)
700 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[0]), outputMask
, blendOut
.x
);
702 if (!pRTBlend
->writeDisableGreen
)
704 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[2]), outputMask
, blendOut
.y
);
706 if (!pRTBlend
->writeDisableBlue
)
708 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[4]), outputMask
, blendOut
.z
);
710 if (!pRTBlend
->writeDisableAlpha
)
712 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[6]), outputMask
, blendOut
.w
);
// SwrBackendTraits: compile-time bundle of back-end configuration. Each
// integer template argument is turned into a named static constant, and
// MultisampleT selects the MultisampleTraits instantiation for the sample
// count and pattern.
// NOTE(review): the listing ends here and its brace lines were dropped, so
// the struct may have further members that are not visible.
718 template<uint32_t sampleCountT
= SWR_MULTISAMPLE_1X
, uint32_t samplePattern
= SWR_MSAA_STANDARD_PATTERN
,
719 uint32_t coverage
= 0, uint32_t centroid
= 0, uint32_t forced
= 0, uint32_t canEarlyZ
= 0>
720 struct SwrBackendTraits
// true when samplePattern is SWR_MSAA_STANDARD_PATTERN
722 static const bool bIsStandardPattern
= (samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
// input coverage mode, passed through unchanged
723 static const uint32_t InputCoverage
= coverage
;
// the three flags below are enabled only by the exact value 1
724 static const bool bCentroidPos
= (centroid
== 1);
725 static const bool bForcedSampleCount
= (forced
== 1);
726 static const bool bCanEarlyZ
= (canEarlyZ
== 1);
// any pattern other than the standard one maps to SWR_MSAA_CENTER_PATTERN
727 typedef MultisampleTraits
<(SWR_MULTISAMPLE_COUNT
)sampleCountT
, (bIsStandardPattern
) ? SWR_MSAA_STANDARD_PATTERN
: SWR_MSAA_CENTER_PATTERN
> MultisampleT
;