1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
31 #include "common/os.h"
32 #include "core/context.h"
33 #include "core/multisample.h"
34 #include "depthstencil.h"
35 #include "rdtsc_core.h"
// Forward declarations of backend handlers and helpers. Only
// CalcSampleBarycentrics is defined later in this file; the other
// definitions are not part of this listing.

// Compute work handler: receives a thread-group id and, by reference, the
// spill/fill and scratch buffer pointers.
37 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
, void*& pScratchSpace
);
// The next five handlers share one parameter shape: draw context, worker id,
// macrotile id and an opaque payload pointer.
38 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
39 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
40 void ProcessStoreTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
);
41 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
);
42 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
// Backend entry taking a tile position (x, y), a triangle descriptor and the
// render output buffers. NOTE(review): presumably the path used when no pixel
// shader is bound (name-based) - confirm against the definition.
43 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
);
// Returns a simdmask derived from the clip-distance mask, the user clip
// buffer and the I/J barycentrics; PixelRateZTestLoop ANDs the complement of
// the result into its per-sample coverage.
44 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
);
// Declared here so it can be used before its INLINE definition further down.
45 void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
);
// Table of pixel-rate backend function pointers. The dimensions visible here
// are: multisample type, center-pattern flag, input-coverage mode and
// forced-sample-count flag.
// NOTE(review): listing lines 50 and 52-53 are not visible, so further
// dimensions and the terminating ';' may exist - confirm against the full file.
47 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
48 [2] // isCenterPattern
49 [SWR_INPUT_COVERAGE_COUNT
]
51 [2] // forcedSampleCount
// Identifiers for the backend function families: single sample, MSAA at
// pixel rate and MSAA at sample rate. SWR_BACKEND_FUNCS_MAX is the last
// enumerator and therefore equals the number of families.
// NOTE(review): the enum's braces are not visible in this listing.
55 enum SWR_BACKEND_FUNCS
57 SWR_BACKEND_SINGLE_SAMPLE
,
58 SWR_BACKEND_MSAA_PIXEL_RATE
,
59 SWR_BACKEND_MSAA_SAMPLE_RATE
,
60 SWR_BACKEND_FUNCS_MAX
,
// SIMD offset vectors declared only when the SIMD width is 8; they are
// defined in another translation unit.
// NOTE(review): the code below uses a MASK macro and this #if needs a
// matching #endif; neither is visible in this listing - confirm they follow.
63 #if KNOB_SIMD_WIDTH == 8
64 extern const simdscalar vCenterOffsetsX
;
65 extern const simdscalar vCenterOffsetsY
;
66 extern const simdscalar vULOffsetsX
;
67 extern const simdscalar vULOffsetsY
;
71 INLINE
static uint32_t RasterTileColorOffset(uint32_t sampleNum
)
73 static const uint32_t RasterTileColorOffsets
[16]
75 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8),
76 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
77 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
78 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
79 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
80 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
81 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
82 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
83 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
84 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
85 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
86 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
87 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
88 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
89 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
91 assert(sampleNum
< 16);
92 return RasterTileColorOffsets
[sampleNum
];
95 INLINE
static uint32_t RasterTileDepthOffset(uint32_t sampleNum
)
97 static const uint32_t RasterTileDepthOffsets
[16]
99 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8),
100 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
101 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
102 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
103 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
104 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
105 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
106 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
107 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
108 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
109 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
110 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
111 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
112 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
113 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
115 assert(sampleNum
< 16);
116 return RasterTileDepthOffsets
[sampleNum
];
119 INLINE
static uint32_t RasterTileStencilOffset(uint32_t sampleNum
)
121 static const uint32_t RasterTileStencilOffsets
[16]
123 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8),
124 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 2,
125 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 3,
126 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 4,
127 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 5,
128 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 6,
129 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 7,
130 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 8,
131 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 9,
132 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 10,
133 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 11,
134 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 12,
135 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 13,
136 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 14,
137 (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8) * 15,
139 assert(sampleNum
< 16);
140 return RasterTileStencilOffsets
[sampleNum
];
// Functor-style helper whose constructors compute the pixel shader input
// coverage for one 4x2 SIMD tile.
//  - First constructor: writes one integer per pixel into inputMask[], built
//    from the per-sample coverage words read through coverageMask.
//  - Second constructor: runs the first and packs the eight results into the
//    simdscalar inputCoverage.
// T supplies bIsCenterPattern, bForcedSampleCount, MultisampleT::numSamples
// and InputCoverage.
// NOTE(review): this listing omits brace-only lines and some statements; in
// particular the declaration of `mask`, the `else` lines and the #else/#endif
// that pair with the #if below are not visible - confirm against the full file.
143 template<typename T
, uint32_t InputCoverage
>
144 struct generateInputCoverage
146 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
148 // will need to update for avx512
149 assert(KNOB_SIMD_WIDTH
== 8);
// sampleCoverage[0] holds samples 0-7, sampleCoverage[1] samples 8-15
152 simdscalari sampleCoverage
[2];
154 if(T::bIsCenterPattern
)
156 // center coverage is the same for all samples; just broadcast to the sample slots
157 uint32_t centerCoverage
= ((uint32_t)(*coverageMask
) & MASK
);
// only the first numSamples 32-bit lanes receive the coverage value
158 if(T::MultisampleT::numSamples
== 1)
160 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage
);
162 else if(T::MultisampleT::numSamples
== 2)
164 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage
, centerCoverage
);
166 else if(T::MultisampleT::numSamples
== 4)
168 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage
, centerCoverage
, centerCoverage
, centerCoverage
);
170 else if(T::MultisampleT::numSamples
== 8)
172 sampleCoverage
[0] = _mm256_set1_epi32(centerCoverage
);
174 else if(T::MultisampleT::numSamples
== 16)
176 sampleCoverage
[0] = _mm256_set1_epi32(centerCoverage
);
177 sampleCoverage
[1] = _mm256_set1_epi32(centerCoverage
);
// NOTE(review): presumably the non-center-pattern branch starts here (the
// `else` line is not visible). Coverage is gathered per sample: `src` is the
// value kept for masked-off lanes, index0/index1 select samples 0-7 / 8-15.
182 __m256i src
= _mm256_set1_epi32(0);
183 __m256i index0
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1
;
// gather mask: enable only the lanes that correspond to existing samples
185 if(T::MultisampleT::numSamples
== 1)
187 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
189 else if(T::MultisampleT::numSamples
== 2)
191 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
193 else if(T::MultisampleT::numSamples
== 4)
195 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
197 else if(T::MultisampleT::numSamples
== 8)
199 mask
[0] = _mm256_set1_epi32(-1);
201 else if(T::MultisampleT::numSamples
== 16)
203 mask
[0] = _mm256_set1_epi32(-1);
204 mask
[1] = _mm256_set1_epi32(-1);
205 index1
= _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
// the gather scale of 8 steps one uint64_t per index while reading 32 bits
208 // gather coverage for samples 0-7
209 sampleCoverage
[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index0
, _mm256_castsi256_ps(mask
[0]), 8));
210 if(T::MultisampleT::numSamples
> 8)
212 // gather coverage for samples 8-15
213 sampleCoverage
[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index1
, _mm256_castsi256_ps(mask
[1]), 8));
// mask[0] is reused as a byte-shuffle control: within each 128-bit lane it
// selects bytes 0x0, 0x4, 0x8, 0xC (the low byte of each 32-bit element);
// -1 entries produce zero bytes
217 mask
[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
218 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
219 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
220 simdscalari packedCoverage0
= _simd_shuffle_epi8(sampleCoverage
[0], mask
[0]);
// packedCoverage1 is only assigned (and only read) when numSamples > 8
222 simdscalari packedCoverage1
;
223 if(T::MultisampleT::numSamples
> 8)
225 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
226 packedCoverage1
= _simd_shuffle_epi8(sampleCoverage
[1], mask
[0]);
// AVX variant: packs using 128-bit permute, float shuffle and float blend.
// The statements after listing line 247 use _mm256_permutevar8x32_epi32 and
// _mm256_blend_epi32 instead (NOTE(review): presumably the #else branch -
// the directive itself is not visible).
229 #if (KNOB_ARCH == KNOB_ARCH_AVX)
230 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
231 simdscalari hiToLow
= _mm256_permute2f128_si256(packedCoverage0
, packedCoverage0
, 0x83);
232 simdscalar shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
233 packedCoverage0
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), shufRes
, 0xFE));
235 simdscalari packedSampleCoverage
;
236 if(T::MultisampleT::numSamples
> 8)
238 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
239 hiToLow
= _mm256_permute2f128_si256(packedCoverage1
, packedCoverage1
, 0x83);
240 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
241 shufRes
= _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1
), shufRes
, 0xFE);
242 packedCoverage1
= _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes
), _mm256_castps_pd(shufRes
), 0x01)));
243 packedSampleCoverage
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), _mm256_castsi256_ps(packedCoverage1
), 0xFC));
// 8 or fewer samples: samples 0-7 are the whole result
247 packedSampleCoverage
= packedCoverage0
;
// variant built on cross-lane 32-bit permute
250 simdscalari permMask
= _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
251 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
252 packedCoverage0
= _mm256_permutevar8x32_epi32(packedCoverage0
, permMask
);
254 simdscalari packedSampleCoverage
;
255 if(T::MultisampleT::numSamples
> 8)
257 permMask
= _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
258 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
259 packedCoverage1
= _mm256_permutevar8x32_epi32(packedCoverage1
, permMask
);
261 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
262 packedSampleCoverage
= _mm256_blend_epi32(packedCoverage0
, packedCoverage1
, 0x0C);
266 packedSampleCoverage
= packedCoverage0
;
// Each iteration takes the byte-wise movemask of the packed coverage and then
// shifts every 32-bit element left by one, so pixels are produced from index
// KNOB_SIMD_WIDTH - 1 down to 0.
270 for(int32_t i
= KNOB_SIMD_WIDTH
- 1; i
>= 0; i
--)
272 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
273 inputMask
[i
] = _simd_movemask_epi8(packedSampleCoverage
);
275 if(!T::bForcedSampleCount
)
277 // input coverage has to be anded with sample mask if MSAA isn't forced on
278 inputMask
[i
] &= sampleMask
;
281 // shift to the next pixel in the 4x2
282 packedSampleCoverage
= _simd_slli_epi32(packedSampleCoverage
, 1);
// Second constructor: computes the per-pixel masks with the constructor above
// and packs them, pixel 0 in the lowest element, into inputCoverage.
286 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, simdscalar
&inputCoverage
, const uint32_t sampleMask
)
288 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
289 generateInputCoverage
<T
, T::InputCoverage
>(coverageMask
, inputMask
, sampleMask
);
290 inputCoverage
= _simd_castsi_ps(_mm256_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]));
296 struct generateInputCoverage
<T
, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>
298 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, simdscalar
&inputCoverage
, const uint32_t sampleMask
)
300 // will need to update for avx512
301 assert(KNOB_SIMD_WIDTH
== 8);
302 simdscalari vec
= _mm256_set1_epi32(coverageMask
[0]);
303 const simdscalari bit
= _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
304 vec
= _simd_and_si(vec
, bit
);
305 vec
= _simd_cmplt_epi32(_mm256_setzero_si256(), vec
);
306 vec
= _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec
);
307 inputCoverage
= _simd_castsi_ps(vec
);
310 INLINE
generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
312 uint32_t simdCoverage
= (coverageMask
[0] & MASK
);
313 static const uint32_t FullCoverageMask
= (1 << T::MultisampleT::numSamples
) - 1;
314 for(int i
= 0; i
< KNOB_SIMD_WIDTH
; i
++)
316 // set all samples to covered if conservative coverage mask is set for that pixel
317 inputMask
[i
] = (((1 << i
) & simdCoverage
) > 0) ? FullCoverageMask
: 0;
322 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
323 // Centroid behaves exactly as follows :
324 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
325 // have a sample location there).
326 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
327 // coverage with the SampleMask Rasterizer State.
328 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
329 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
330 // SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
331 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
333 INLINE
void CalcCentroidPos(SWR_PS_CONTEXT
&psContext
, const SWR_MULTISAMPLE_POS
& samplePos
,
334 const uint64_t *const coverageMask
, const uint32_t sampleMask
,
335 const simdscalar vXSamplePosUL
, const simdscalar vYSamplePosUL
)
337 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
338 generateInputCoverage
<T
, T::InputCoverage
>(coverageMask
, inputMask
, sampleMask
);
340 // Case (2) - partially covered pixel
342 // scan for first covered sample per pixel in the 4x2 span
343 unsigned long sampleNum
[KNOB_SIMD_WIDTH
];
344 (inputMask
[0] > 0) ? (_BitScanForward(&sampleNum
[0], inputMask
[0])) : (sampleNum
[0] = 0);
345 (inputMask
[1] > 0) ? (_BitScanForward(&sampleNum
[1], inputMask
[1])) : (sampleNum
[1] = 0);
346 (inputMask
[2] > 0) ? (_BitScanForward(&sampleNum
[2], inputMask
[2])) : (sampleNum
[2] = 0);
347 (inputMask
[3] > 0) ? (_BitScanForward(&sampleNum
[3], inputMask
[3])) : (sampleNum
[3] = 0);
348 (inputMask
[4] > 0) ? (_BitScanForward(&sampleNum
[4], inputMask
[4])) : (sampleNum
[4] = 0);
349 (inputMask
[5] > 0) ? (_BitScanForward(&sampleNum
[5], inputMask
[5])) : (sampleNum
[5] = 0);
350 (inputMask
[6] > 0) ? (_BitScanForward(&sampleNum
[6], inputMask
[6])) : (sampleNum
[6] = 0);
351 (inputMask
[7] > 0) ? (_BitScanForward(&sampleNum
[7], inputMask
[7])) : (sampleNum
[7] = 0);
353 // look up and set the sample offsets from UL pixel corner for first covered sample
354 __m256 vXSample
= _mm256_set_ps(samplePos
.X(sampleNum
[7]),
355 samplePos
.X(sampleNum
[6]),
356 samplePos
.X(sampleNum
[5]),
357 samplePos
.X(sampleNum
[4]),
358 samplePos
.X(sampleNum
[3]),
359 samplePos
.X(sampleNum
[2]),
360 samplePos
.X(sampleNum
[1]),
361 samplePos
.X(sampleNum
[0]));
363 __m256 vYSample
= _mm256_set_ps(samplePos
.Y(sampleNum
[7]),
364 samplePos
.Y(sampleNum
[6]),
365 samplePos
.Y(sampleNum
[5]),
366 samplePos
.Y(sampleNum
[4]),
367 samplePos
.Y(sampleNum
[3]),
368 samplePos
.Y(sampleNum
[2]),
369 samplePos
.Y(sampleNum
[1]),
370 samplePos
.Y(sampleNum
[0]));
371 // add sample offset to UL pixel corner
372 vXSample
= _simd_add_ps(vXSamplePosUL
, vXSample
);
373 vYSample
= _simd_add_ps(vYSamplePosUL
, vYSample
);
375 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
376 static const simdscalari vFullyCoveredMask
= T::MultisampleT::FullSampleMask();
377 simdscalari vInputCoveragei
= _mm256_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]);
378 simdscalari vAllSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vFullyCoveredMask
);
380 static const simdscalari vZero
= _simd_setzero_si();
381 const simdscalari vSampleMask
= _simd_and_si(_simd_set1_epi32(sampleMask
), vFullyCoveredMask
);
382 simdscalari vNoSamplesCovered
= _simd_cmpeq_epi32(vInputCoveragei
, vZero
);
383 simdscalari vIsFullSampleMask
= _simd_cmpeq_epi32(vSampleMask
, vFullyCoveredMask
);
384 simdscalari vCase3b
= _simd_and_si(vNoSamplesCovered
, vIsFullSampleMask
);
386 simdscalari vEvalAtCenter
= _simd_or_si(vAllSamplesCovered
, vCase3b
);
388 // set the centroid position based on results from above
389 psContext
.vX
.centroid
= _simd_blendv_ps(vXSample
, psContext
.vX
.center
, _simd_castsi_ps(vEvalAtCenter
));
390 psContext
.vY
.centroid
= _simd_blendv_ps(vYSample
, psContext
.vY
.center
, _simd_castsi_ps(vEvalAtCenter
));
392 // Case (3a) No samples covered and partial sample mask
393 simdscalari vSomeSampleMaskSamples
= _simd_cmplt_epi32(vSampleMask
, vFullyCoveredMask
);
394 // sample mask should never be all 0's for this case, but handle it anyways
395 unsigned long firstCoveredSampleMaskSample
= 0;
396 (sampleMask
> 0) ? (_BitScanForward(&firstCoveredSampleMaskSample
, sampleMask
)) : (firstCoveredSampleMaskSample
= 0);
398 simdscalari vCase3a
= _simd_and_si(vNoSamplesCovered
, vSomeSampleMaskSamples
);
400 vXSample
= _simd_set1_ps(samplePos
.X(firstCoveredSampleMaskSample
));
401 vYSample
= _simd_set1_ps(samplePos
.Y(firstCoveredSampleMaskSample
));
403 // blend in case 3a pixel locations
404 psContext
.vX
.centroid
= _simd_blendv_ps(psContext
.vX
.centroid
, vXSample
, _simd_castsi_ps(vCase3a
));
405 psContext
.vY
.centroid
= _simd_blendv_ps(psContext
.vY
.centroid
, vYSample
, _simd_castsi_ps(vCase3a
));
408 INLINE
void CalcCentroidBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
,
409 const simdscalar vXSamplePosUL
, const simdscalar vYSamplePosUL
)
412 psContext
.vI
.centroid
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
413 psContext
.vJ
.centroid
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.centroid
, psContext
.vY
.centroid
);
414 psContext
.vI
.centroid
= _simd_mul_ps(psContext
.vI
.centroid
, coeffs
.vRecipDet
);
415 psContext
.vJ
.centroid
= _simd_mul_ps(psContext
.vJ
.centroid
, coeffs
.vRecipDet
);
418 psContext
.vOneOverW
.centroid
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.centroid
, psContext
.vJ
.centroid
);
421 INLINE simdmask
CalcDepthBoundsAcceptMask(simdscalar z
, float minz
, float maxz
)
423 const simdscalar minzMask
= _simd_cmpge_ps(z
, _simd_set1_ps(minz
));
424 const simdscalar maxzMask
= _simd_cmple_ps(z
, _simd_set1_ps(maxz
));
426 return _simd_movemask_ps(_simd_and_ps(minzMask
, maxzMask
));
// Returns the number of samples the output merger (OM) runs at for backend
// traits T, given the sample count of the blend state / render target.
// Visible cases:
//  - forced sample count with a multisampled rasterizer
//  - forced sample count at 1X: GetNumSamples(blendSampleCount)
//  - otherwise: T::MultisampleT::numSamples
// NOTE(review): the template header, the body of the first branch and the
// final `else` are not visible in this listing (listing lines 429, 434-436,
// 443-444) - confirm the first branch's return value against the full file.
430 INLINE
uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount
)
432 // RT has to be single sample if we're in forcedMSAA mode
433 if(T::bForcedSampleCount
&& (T::MultisampleT::sampleCount
> SWR_MULTISAMPLE_1X
))
437 // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
438 else if(T::bForcedSampleCount
&& (T::MultisampleT::sampleCount
== SWR_MULTISAMPLE_1X
))
440 return GetNumSamples(blendSampleCount
);
442 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
445 return T::MultisampleT::numSamples
;
449 inline void SetupBarycentricCoeffs(BarycentricCoeffs
*coeffs
, const SWR_TRIANGLE_DESC
&work
)
453 coeffs
->vIa
= _simd_broadcast_ss(&work
.I
[0]);
454 coeffs
->vIb
= _simd_broadcast_ss(&work
.I
[1]);
455 coeffs
->vIc
= _simd_broadcast_ss(&work
.I
[2]);
457 coeffs
->vJa
= _simd_broadcast_ss(&work
.J
[0]);
458 coeffs
->vJb
= _simd_broadcast_ss(&work
.J
[1]);
459 coeffs
->vJc
= _simd_broadcast_ss(&work
.J
[2]);
461 coeffs
->vZa
= _simd_broadcast_ss(&work
.Z
[0]);
462 coeffs
->vZb
= _simd_broadcast_ss(&work
.Z
[1]);
463 coeffs
->vZc
= _simd_broadcast_ss(&work
.Z
[2]);
465 coeffs
->vRecipDet
= _simd_broadcast_ss(&work
.recipDet
);
467 coeffs
->vAOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[0]);
468 coeffs
->vBOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[1]);
469 coeffs
->vCOneOverW
= _simd_broadcast_ss(&work
.OneOverW
[2]);
// Copies the buffer pointers held in renderBuffers into the caller's
// variables: the first colorBufferCount color pointers into pColorBuffer[],
// the depth pointer into *pDepthBuffer and the stencil pointer into
// *pStencilBuffer. colorBufferCount must not exceed SWR_NUM_RENDERTARGETS
// (asserted).
// NOTE(review): listing lines 476-477, 484-485 and 489-490 are not visible;
// they may hold null checks on the output pointers - confirm before relying
// on passing null.
// Change: removed the stray second ';' after the stencil assignment.
472 inline void SetupRenderBuffers(uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], uint8_t **pDepthBuffer
, uint8_t **pStencilBuffer
, uint32_t colorBufferCount
, RenderOutputBuffers
&renderBuffers
)
474 assert(colorBufferCount
<= SWR_NUM_RENDERTARGETS
);
478 for (uint32_t index
= 0; index
< colorBufferCount
; index
+= 1)
480 pColorBuffer
[index
] = renderBuffers
.pColor
[index
];
486 *pDepthBuffer
= renderBuffers
.pDepth
;
491 *pStencilBuffer
= renderBuffers
.pStencil
;
496 void SetupPixelShaderContext(SWR_PS_CONTEXT
*psContext
, const SWR_MULTISAMPLE_POS
& samplePos
, SWR_TRIANGLE_DESC
&work
)
498 psContext
->pAttribs
= work
.pAttribs
;
499 psContext
->pPerspAttribs
= work
.pPerspAttribs
;
500 psContext
->frontFace
= work
.triFlags
.frontFacing
;
501 psContext
->renderTargetArrayIndex
= work
.triFlags
.renderTargetArrayIndex
;
503 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
504 psContext
->I
= work
.I
;
505 psContext
->J
= work
.J
;
507 psContext
->recipDet
= work
.recipDet
;
508 psContext
->pRecipW
= work
.pRecipW
;
509 psContext
->pSamplePosX
= samplePos
.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
510 psContext
->pSamplePosY
= samplePos
.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
511 psContext
->rasterizerSampleCount
= T::MultisampleT::numSamples
;
512 psContext
->sampleIndex
= 0;
// Fills the centroid fields of *psContext (position vX/vY, barycentrics
// vI/vJ and vOneOverW).
//  - IsSingleSample: all centroid values are copied from the center values.
//  - Otherwise the visible statements either set the centroid position to
//    UL + 0.5 (T::bIsCenterPattern) or compute it with CalcCentroidPos, then
//    evaluate the barycentrics with CalcCentroidBarycentrics; a further
//    visible path copies the sample position into the centroid position.
// NOTE(review): listing lines 528-530, 538 and 546 are not visible, so the
// condition that separates the last path from the others and the `else`
// lines cannot be confirmed from this listing.
515 template<typename T
, bool IsSingleSample
>
516 void CalcCentroid(SWR_PS_CONTEXT
*psContext
, const SWR_MULTISAMPLE_POS
& samplePos
,
517 const BarycentricCoeffs
&coeffs
, const uint64_t * const coverageMask
, uint32_t sampleMask
)
519 if (IsSingleSample
) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
521 // for 1x case, centroid is pixel center
522 psContext
->vX
.centroid
= psContext
->vX
.center
;
523 psContext
->vY
.centroid
= psContext
->vY
.center
;
524 psContext
->vI
.centroid
= psContext
->vI
.center
;
525 psContext
->vJ
.centroid
= psContext
->vJ
.center
;
526 psContext
->vOneOverW
.centroid
= psContext
->vOneOverW
.center
;
532 ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
533 if (T::bIsCenterPattern
)
// center pattern: centroid position is the UL corner plus half a pixel
535 psContext
->vX
.centroid
= _simd_add_ps(psContext
->vX
.UL
, _simd_set1_ps(0.5f
));
536 psContext
->vY
.centroid
= _simd_add_ps(psContext
->vY
.UL
, _simd_set1_ps(0.5f
));
540 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
541 CalcCentroidPos
<T
>(*psContext
, samplePos
, coverageMask
, sampleMask
, psContext
->vX
.UL
, psContext
->vY
.UL
);
// barycentrics are evaluated at whichever centroid position was set above
544 CalcCentroidBarycentrics(coeffs
, *psContext
, psContext
->vX
.UL
, psContext
->vY
.UL
);
// remaining path: centroid position taken from the per-sample position
548 psContext
->vX
.centroid
= psContext
->vX
.sample
;
549 psContext
->vY
.centroid
= psContext
->vY
.sample
;
// Functor that, for one SIMD of pixels, walks every coverage sample and runs
// the depth-bounds test, user clip test and depth/stencil test, keeping the
// per-sample Z, coverage, depth-pass and stencil-pass masks in its members
// for later use.
// NOTE(review): this listing omits brace-only lines and some statements: the
// template header, the statements that end an iteration early, the return
// statement of operator() and the declarations of pDC and workerId are not
// visible - confirm against the full file.
555 struct PixelRateZTestLoop
// Stores the references and values needed by operator(). samplePos is
// initialized from the `state` member, which is declared before it below.
557 PixelRateZTestLoop(DRAW_CONTEXT
*DC
, uint32_t _workerId
, const SWR_TRIANGLE_DESC
&Work
, const BarycentricCoeffs
& Coeffs
, const API_STATE
& apiState
,
558 uint8_t*& depthBuffer
, uint8_t*& stencilBuffer
, const uint8_t ClipDistanceMask
) :
559 pDC(DC
), workerId(_workerId
), work(Work
), coeffs(Coeffs
), state(apiState
), psState(apiState
.psState
),
560 samplePos(state
.rastState
.samplePositions
),
561 clipDistanceMask(ClipDistanceMask
), pDepthBuffer(depthBuffer
), pStencilBuffer(stencilBuffer
){};
// Runs the per-sample tests.
//   activeLanes      - in: lanes to process; out: ANDed with the lanes where
//                      at least one sample passed the depth test
//   psContext        - per-sample position and barycentric fields are written
//   BEDepthBucket    - only referenced by the commented-out bucket calls
//   currentSimdIn8x8 - selects which byte of each sample's coverage is used
// statCount accumulates the number of set bits of each sample's depth pass
// mask (the comment near the end describes it as the returned value).
564 uint32_t operator()(simdscalar
& activeLanes
, SWR_PS_CONTEXT
& psContext
,
565 const CORE_BUCKETS BEDepthBucket
, uint32_t currentSimdIn8x8
= 0)
// NOTE(review): pContext is not referenced by any visible statement;
// presumably the AR_BEGIN / AR_END macros use it - confirm.
567 SWR_CONTEXT
*pContext
= pDC
->pContext
;
569 uint32_t statCount
= 0;
570 simdscalar anyDepthSamplePassed
= _simd_setzero_ps();
571 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
// coverage of this sample for the current SIMD, limited to active lanes
573 const uint8_t *pCoverageMask
= (uint8_t*)&work
.coverageMask
[sample
];
574 vCoverageMask
[sample
] = _simd_and_ps(activeLanes
, vMask(pCoverageMask
[currentSimdIn8x8
] & MASK
));
// no lane covered by this sample: clear its saved masks
576 if(!_simd_movemask_ps(vCoverageMask
[sample
]))
578 vCoverageMask
[sample
] = depthPassMask
[sample
] = stencilPassMask
[sample
] = _simd_setzero_ps();
582 // offset depth/stencil buffers current sample
583 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
584 uint8_t * pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// depth bounds test: compares the Z values already stored in the depth
// buffer against [min, max] and removes failing lanes from coverage
586 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
588 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
590 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
592 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
593 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
595 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], vMask(CalcDepthBoundsAcceptMask(z
, minz
, maxz
)));
598 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
600 // calculate per sample positions
601 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
602 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
604 // calc I & J per sample
605 CalcSampleBarycentrics(coeffs
, psContext
);
// Z source: the value written by the pixel shader, or Z interpolated at the
// sample position and passed through the quantize callback
607 if(psState
.writesODepth
)
610 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
611 vZ
[sample
] = psContext
.vZ
;
616 vZ
[sample
] = vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
617 vZ
[sample
] = state
.pfnQuantizeDepth(vZ
[sample
]);
620 AR_END(BEBarycentric
, 0);
622 ///@todo: perspective correct vs non-perspective correct clipping?
623 // if clip distances are enabled, we need to interpolate for each sample
// lanes flagged in clipMask are removed from coverage
626 uint8_t clipMask
= ComputeUserClipMask(clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
628 vCoverageMask
[sample
] = _simd_and_ps(vCoverageMask
[sample
], vMask(~clipMask
));
631 // ZTest for this sample
632 ///@todo Need to uncomment out this bucket.
633 //AR_BEGIN(BEDepthBucket, pDC->drawId);
// both pass masks start as the coverage; DepthStencilTest returns the depth
// result and updates the stencil mask through the pointer argument
634 depthPassMask
[sample
] = vCoverageMask
[sample
];
635 stencilPassMask
[sample
] = vCoverageMask
[sample
];
636 depthPassMask
[sample
] = DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
637 vZ
[sample
], pDepthSample
, vCoverageMask
[sample
],
638 pStencilSample
, &stencilPassMask
[sample
]);
639 //AR_END(BEDepthBucket, 0);
641 // early-exit if no pixels passed depth or earlyZ is forced on
// in either case depth/stencil are written here, before pixel shading
642 if(psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
[sample
]))
644 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, vZ
[sample
],
645 pDepthSample
, depthPassMask
[sample
], vCoverageMask
[sample
], pStencilSample
, stencilPassMask
[sample
]);
647 if(!_simd_movemask_ps(depthPassMask
[sample
]))
// accumulate which lanes passed for any sample and count the passing samples
652 anyDepthSamplePassed
= _simd_or_ps(anyDepthSamplePassed
, depthPassMask
[sample
]);
653 uint32_t statMask
= _simd_movemask_ps(depthPassMask
[sample
]);
654 statCount
+= _mm_popcnt_u32(statMask
);
657 activeLanes
= _simd_and_ps(anyDepthSamplePassed
, activeLanes
);
658 // return number of samples that passed depth and coverage
662 // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
663 simdscalar vZ
[T::MultisampleT::numCoverageSamples
];
664 simdscalar vCoverageMask
[T::MultisampleT::numCoverageSamples
];
665 simdscalar depthPassMask
[T::MultisampleT::numCoverageSamples
];
666 simdscalar stencilPassMask
[T::MultisampleT::numCoverageSamples
];
// inputs captured by the constructor; the buffer pointers are held by
// reference, so they reflect the caller's current pointer values
673 const SWR_TRIANGLE_DESC
& work
;
674 const BarycentricCoeffs
& coeffs
;
675 const API_STATE
& state
;
676 const SWR_PS_STATE
& psState
;
677 const SWR_MULTISAMPLE_POS
& samplePos
;
678 const uint8_t clipDistanceMask
;
679 uint8_t*& pDepthBuffer
;
680 uint8_t*& pStencilBuffer
;
683 INLINE
void CalcPixelBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
686 psContext
.vI
.center
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.center
, psContext
.vY
.center
);
687 psContext
.vJ
.center
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.center
, psContext
.vY
.center
);
688 psContext
.vI
.center
= _simd_mul_ps(psContext
.vI
.center
, coeffs
.vRecipDet
);
689 psContext
.vJ
.center
= _simd_mul_ps(psContext
.vJ
.center
, coeffs
.vRecipDet
);
692 psContext
.vOneOverW
.center
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.center
, psContext
.vJ
.center
);
695 INLINE
void CalcSampleBarycentrics(const BarycentricCoeffs
& coeffs
, SWR_PS_CONTEXT
&psContext
)
698 psContext
.vI
.sample
= vplaneps(coeffs
.vIa
, coeffs
.vIb
, coeffs
.vIc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
699 psContext
.vJ
.sample
= vplaneps(coeffs
.vJa
, coeffs
.vJb
, coeffs
.vJc
, psContext
.vX
.sample
, psContext
.vY
.sample
);
700 psContext
.vI
.sample
= _simd_mul_ps(psContext
.vI
.sample
, coeffs
.vRecipDet
);
701 psContext
.vJ
.sample
= _simd_mul_ps(psContext
.vJ
.sample
, coeffs
.vRecipDet
);
704 psContext
.vOneOverW
.sample
= vplaneps(coeffs
.vAOneOverW
, coeffs
.vBOneOverW
, coeffs
.vCOneOverW
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// OutputMerger4x2: for each of the first NumRT render targets, optionally blends
// the pixel shader output and then writes the color channels of one sample into
// that target's color data with masked SIMD stores.
//   psContext     - supplies the shader outputs (psContext.shaded[])
//   pColorBase    - per-render-target base pointers of the color data
//   sample        - sample index; selects the color offset via RasterTileColorOffset
//   pBlendState   - supplies renderTarget[rt], whose writeDisable* flags gate the stores
//   pfnBlendFunc  - per-render-target blend functions; a nullptr entry means no blend
//   coverageMask  - taken by reference; its address is one of the blend arguments
//   depthPassMask - ANDed with coverageMask to form the store mask
//   NumRT         - number of render targets processed
707 // Merge Output to 4x2 SIMD Tile Format
708 INLINE
void OutputMerger4x2(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
709 const PFN_BLEND_JIT_FUNC (&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar depthPassMask
, const uint32_t NumRT
)
711 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
// Offset (added to a uint8_t* below, so in bytes) of this sample's color data.
712 const uint32_t rasterTileColorOffset
= RasterTileColorOffset(sample
);
// Visit render targets 0 .. NumRT-1.
715 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
// Address of this sample's color data within render target rt.
717 uint8_t *pColorSample
= pColorBase
[rt
] + rasterTileColorOffset
;
// Blend state of render target rt; only its writeDisable* flags are read here.
719 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
722 // pfnBlendFunc may not update all channels. Initialize with PS output.
723 /// TODO: move this into the blend JIT.
724 blendOut
= psContext
.shaded
[rt
];
726 // Blend outputs and update coverage mask for alpha test
727 if(pfnBlendFunc
[rt
] != nullptr)
// Blend arguments: this target's shader output, the w channel of render target 0's
// shader output, and the address of coverageMask (presumably so the blend function
// can modify it, per the comment above).
731 psContext
.shaded
[rt
],
733 psContext
.shaded
[0].w
,
738 (simdscalari
*)&coverageMask
);
// Store mask: lanes set in both coverageMask and depthPassMask.
743 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
745 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
746 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
// Size in bytes of KNOB_SIMD_WIDTH floats; the x, y, z and w stores below are
// placed at multiples 0, 1, 2 and 3 of this stride from pColorSample.
748 const uint32_t simd
= KNOB_SIMD_WIDTH
* sizeof(float);
750 // store with color mask
751 if(!pRTBlend
->writeDisableRed
)
753 _simd_maskstore_ps((float*)pColorSample
, outputMask
, blendOut
.x
);
755 if(!pRTBlend
->writeDisableGreen
)
757 _simd_maskstore_ps((float*)(pColorSample
+ simd
), outputMask
, blendOut
.y
);
759 if(!pRTBlend
->writeDisableBlue
)
761 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 2), outputMask
, blendOut
.z
);
763 if(!pRTBlend
->writeDisableAlpha
)
765 _simd_maskstore_ps((float*)(pColorSample
+ simd
* 3), outputMask
, blendOut
.w
);
770 #if USE_8x2_TILE_BACKEND
771 // Merge Output to 8x2 SIMD16 Tile Format
// OutputMerger8x2: 8x2-tile variant of the output merger. For each of the first
// NumRT render targets it optionally blends the pixel shader output and writes
// the color channels of one sample with masked SIMD stores.
//   psContext             - supplies the shader outputs (psContext.shaded[])
//   pColorBase            - per-render-target base pointers of the color data
//   sample                - sample index; selects the color offset via RasterTileColorOffset
//   pBlendState           - supplies renderTarget[rt], whose writeDisable* flags gate the stores
//   pfnBlendFunc          - per-render-target blend functions; a nullptr entry means no blend
//   coverageMask          - taken by reference; its address is one of the blend arguments
//   depthPassMask         - ANDed with coverageMask to form the store mask
//   NumRT                 - number of render targets processed
//   colorBufferEnableMask - one bit per render target (bit rt for target rt), tested
//                           before the existing color is read into blendSrc
//   useAlternateOffset    - when true, sizeof(simdscalar) is added to the color offset
772 INLINE
void OutputMerger8x2(SWR_PS_CONTEXT
&psContext
, uint8_t* (&pColorBase
)[SWR_NUM_RENDERTARGETS
], uint32_t sample
, const SWR_BLEND_STATE
*pBlendState
,
773 const PFN_BLEND_JIT_FUNC(&pfnBlendFunc
)[SWR_NUM_RENDERTARGETS
], simdscalar
&coverageMask
, simdscalar depthPassMask
, const uint32_t NumRT
, const uint32_t colorBufferEnableMask
, bool useAlternateOffset
)
775 // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
// Offset (added to a uint8_t* below, so in bytes) of this sample's color data.
776 uint32_t rasterTileColorOffset
= RasterTileColorOffset(sample
);
// The alternate position lies one simdscalar further into the color data.
778 if (useAlternateOffset
)
780 rasterTileColorOffset
+= sizeof(simdscalar
);
// colorBufferBit is shifted left once per iteration so that it always selects
// bit rt of colorBufferEnableMask.
786 uint32_t colorBufferBit
= 1;
787 for (uint32_t rt
= 0; rt
< NumRT
; rt
+= 1, colorBufferBit
<<= 1)
// This sample's color data in render target rt, viewed as an array of simdscalar.
// The channels are accessed at even indices 0, 2, 4 and 6 below.
789 simdscalar
*pColorSample
= reinterpret_cast<simdscalar
*>(pColorBase
[rt
] + rasterTileColorOffset
);
791 const SWR_RENDER_TARGET_BLEND_STATE
*pRTBlend
= &pBlendState
->renderTarget
[rt
];
// Test this render target's enable bit, then read the four channel registers
// currently stored in the color data into blendSrc.
793 if (colorBufferBit
& colorBufferEnableMask
)
795 blendSrc
[0] = pColorSample
[0];
796 blendSrc
[1] = pColorSample
[2];
797 blendSrc
[2] = pColorSample
[4];
798 blendSrc
[3] = pColorSample
[6];
802 // pfnBlendFunc may not update all channels. Initialize with PS output.
803 /// TODO: move this into the blend JIT.
804 blendOut
= psContext
.shaded
[rt
];
806 // Blend outputs and update coverage mask for alpha test
807 if(pfnBlendFunc
[rt
] != nullptr)
// Blend arguments: this target's shader output, the w channel of render target 0's
// shader output, blendSrc viewed as raw bytes, and the address of coverageMask.
811 psContext
.shaded
[rt
],
813 psContext
.shaded
[0].w
,
815 reinterpret_cast<uint8_t *>(&blendSrc
),
818 reinterpret_cast<simdscalari
*>(&coverageMask
));
// Store mask: lanes set in both coverageMask and depthPassMask.
823 simdscalari outputMask
= _simd_castps_si(_simd_and_ps(coverageMask
, depthPassMask
));
825 ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
826 static_assert(KNOB_COLOR_HOT_TILE_FORMAT
== R32G32B32A32_FLOAT
, "Unsupported hot tile format");
828 // store with color mask
829 if (!pRTBlend
->writeDisableRed
)
831 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[0]), outputMask
, blendOut
.x
);
833 if (!pRTBlend
->writeDisableGreen
)
835 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[2]), outputMask
, blendOut
.y
);
837 if (!pRTBlend
->writeDisableBlue
)
839 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[4]), outputMask
, blendOut
.z
);
841 if (!pRTBlend
->writeDisableAlpha
)
843 _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample
[6]), outputMask
, blendOut
.w
);
// BackendPixelRate: pixel-rate backend for the tile starting at (x, y). The tile is
// walked in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps; for each step the pixel shader
// is invoked at most once and its outputs are then merged for every output-merger
// sample.
//   pDC           - draw context; provides pContext, drawId, the API state
//                   (GetApiState) and the private state handed to the pixel shader
//   workerId      - forwarded to the PixelRateZTestLoop constructor
//   x, y          - tile origin; the loops cover [x, x + KNOB_TILE_X_DIM) and
//                   [y, y + KNOB_TILE_Y_DIM)
//   work          - triangle descriptor. NOTE: it is modified here - coverageMask[],
//                   anyCoveredSamples and (for inner-conservative input coverage)
//                   innerCoverageMask are shifted right after every SIMD tile
//   renderBuffers - passed to SetupRenderBuffers to obtain the color, depth and
//                   stencil pointers
// T is the backend traits type used for compile-time selection (InputCoverage,
// bForcedSampleCount, bCanEarlyZ, bIsCenterPattern, MultisampleT).
851 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
853 ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
856 SWR_CONTEXT
*pContext
= pDC
->pContext
;
858 AR_BEGIN(BEPixelRateBackend
, pDC
->drawId
);
859 AR_BEGIN(BESetup
, pDC
->drawId
);
861 const API_STATE
&state
= GetApiState(pDC
);
// Per-triangle setup: barycentric coefficients and the pixel shader context.
863 BarycentricCoeffs coeffs
;
864 SetupBarycentricCoeffs(&coeffs
, work
);
866 SWR_PS_CONTEXT psContext
;
867 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
868 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// Fetch the color (into psContext.pColorBuffer), depth and stencil pointers.
// These pointers are advanced at the end of every SIMD tile below.
870 uint8_t *pDepthBuffer
, *pStencilBuffer
;
871 SetupRenderBuffers(psContext
.pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
// Depth/stencil test functor. After it runs, its vZ, vCoverageMask, depthPassMask
// and stencilPassMask members are read back in the output-merger loop below.
875 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, workerId
, work
, coeffs
, state
, pDepthBuffer
, pStencilBuffer
, state
.rastState
.clipDistanceMask
);
// Initial Y positions (upper-left and center) for the first row of SIMD tiles.
877 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
878 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
// Y step applied after each row of SIMD tiles.
880 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
882 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Reset the X positions to the left edge of the tile at the start of each row.
884 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
885 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
// X step applied after each SIMD tile.
887 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
889 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
891 #if USE_8x2_TILE_BACKEND
// True when the SIMD_TILE_X_DIM bit of xx is set; selects the alternate color
// offset in OutputMerger8x2 and the color pointer advance at the end of the tile.
892 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
// Skip this SIMD tile entirely when none of its samples are covered; otherwise
// expand the covered bits into a per-lane mask.
894 simdscalar activeLanes
;
895 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
896 activeLanes
= vMask(work
.anyCoveredSamples
& MASK
);
// Generate the shader's input coverage mask when the traits request it, using the
// inner-conservative mask or the first regular coverage mask as the source.
898 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
900 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
902 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
905 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
// Pixel-center barycentrics, followed by the centroid computation.
907 CalcPixelBarycentrics(coeffs
, psContext
);
909 CalcCentroid
<T
, false>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
911 AR_END(BEBarycentric
, 0);
913 if(T::bForcedSampleCount
)
915 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
// NOTE(review): if _simd_cmpgt_epi32 is a signed compare, a sampleMask with bit 31
// set would compare as not greater than zero and clear all lanes - confirm.
916 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state
.blendState
.sampleMask
), _simd_setzero_si()));
917 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
// Early depth/stencil test: runs before the pixel shader when the traits allow it
// and no forced sample count is in use.
921 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
923 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
924 UPDATE_STAT_BE(DepthPassCount
, depthPassCount
);
925 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount
, _simd_movemask_ps(activeLanes
)));
928 // if we have no covered samples that passed depth at this point, go to next tile
929 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// Interpolated Z is only computed for the shader when it declares that it uses it.
931 if(state
.psState
.usesSourceDepth
)
933 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
934 // interpolate and quantize z
935 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
936 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
937 AR_END(BEBarycentric
, 0);
940 // pixels that are currently active
941 psContext
.activeMask
= _simd_castps_si(activeLanes
);
// Start with the full sample mask; oMask is read back after the shader runs.
942 psContext
.oMask
= T::MultisampleT::FullSampleMask();
944 // execute pixel shader
945 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
946 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
// Invocations are counted as the number of lanes active on entry to the shader.
947 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
948 AR_END(BEPixelShader
, 0);
950 // update active lanes to remove any discarded or oMask'd pixels
// NOTE(review): same possibly-signed compare as above - an oMask lane with bit 31
// set would be treated as not greater than zero; confirm.
951 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
952 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// Late depth/stencil test: runs after the pixel shader when early Z was not
// possible and no forced sample count is in use.
955 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
957 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
958 UPDATE_STAT_BE(DepthPassCount
, depthPassCount
);
959 AR_EVENT(LateDepthInfoPixelRate(depthPassCount
, _simd_movemask_ps(activeLanes
)));
962 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
963 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
966 // loop over all samples, broadcasting the results of the PS to all passing pixels
967 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(state
.blendState
.sampleCount
); sample
++)
969 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
970 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
971 uint32_t coverageSampleNum
= (T::bIsCenterPattern
) ? 0 : sample
;
972 simdscalar coverageMask
, depthMask
;
// With a forced sample count no depth test was run above, so the active lanes
// serve as both the coverage and the depth mask.
973 if(T::bForcedSampleCount
)
975 coverageMask
= depthMask
= activeLanes
;
// Otherwise use the masks saved by the depth test for this coverage sample.
979 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
980 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
// No lane passed depth for this sample: close the profiling bucket (presumably
// before moving on to the next sample).
981 if(!_simd_movemask_ps(depthMask
))
983 // stencil should already have been written in early/lateZ tests
984 AR_END(BEOutputMerger
, 0);
989 // broadcast the results of the PS to all passing pixels
990 #if USE_8x2_TILE_BACKEND
991 OutputMerger8x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
,state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.numRenderTargets
, state
.colorHottileEnable
, useAlternateOffset
);
992 #else // USE_8x2_TILE_BACKEND
993 OutputMerger4x2(psContext
, psContext
.pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.numRenderTargets
);
994 #endif // USE_8x2_TILE_BACKEND
// Depth/stencil are written here, after the output merger, unless early Z is
// forced by the shader state or a forced sample count is in use.
996 if(!state
.psState
.forceEarlyZ
&& !T::bForcedSampleCount
)
998 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
999 uint8_t * pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
1001 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
1002 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
1004 AR_END(BEOutputMerger
, 0);
// End of SIMD tile: consume this tile's bits from the coverage masks and advance
// the buffer pointers.
1007 AR_BEGIN(BEEndTile
, pDC
->drawId
);
1009 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1011 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1014 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
1016 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1018 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1020 #if USE_8x2_TILE_BACKEND
// 8x2 build: the color pointers advance only when the alternate offset was in
// use for this tile, and then by two SIMD widths of pixels at once.
1021 if (useAlternateOffset
)
1023 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
1025 psContext
.pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Advance each color pointer by one SIMD width of pixels (presumably the non-8x2
// branch of the conditional compilation above).
1029 for(uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
1031 psContext
.pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
// Depth and stencil pointers advance by one SIMD width of pixels; bpp / 8
// converts bits per pixel to bytes.
1034 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1035 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1037 AR_END(BEEndTile
, 0);
// Step the X positions to the next SIMD tile in this row.
1039 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
1040 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Step the Y positions to the next row of SIMD tiles.
1043 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
1044 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
1047 AR_END(BEPixelRateBackend
, 0);
// SwrBackendTraits: compile-time description of a backend configuration. Each
// integer template argument is turned into a static constant (flags are true
// only when the argument equals 1) that backend code reads through its traits type.
//   sampleCountT - multisample count, cast to SWR_MULTISAMPLE_COUNT for MultisampleT
//   isCenter     - 1 selects the center sample pattern (bIsCenterPattern)
//   coverage     - input coverage mode, stored unchanged in InputCoverage
//   centroid     - 1 sets bCentroidPos
//   forced       - 1 sets bForcedSampleCount
//   canEarlyZ    - 1 sets bCanEarlyZ
1050 template<uint32_t sampleCountT
= SWR_MULTISAMPLE_1X
, uint32_t isCenter
= 0,
1051 uint32_t coverage
= 0, uint32_t centroid
= 0, uint32_t forced
= 0, uint32_t canEarlyZ
= 0
1053 struct SwrBackendTraits
1055 static const bool bIsCenterPattern
= (isCenter
== 1);
1056 static const uint32_t InputCoverage
= coverage
;
1057 static const bool bCentroidPos
= (centroid
== 1);
1058 static const bool bForcedSampleCount
= (forced
== 1);
1059 static const bool bCanEarlyZ
= (canEarlyZ
== 1);
// Multisample traits for this sample count and sample pattern.
1060 typedef MultisampleTraits
<(SWR_MULTISAMPLE_COUNT
)sampleCountT
, bIsCenterPattern
> MultisampleT
;