/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Backend handles rasterization, pixel shading and output merger
*
******************************************************************************/
31 #include "common/os.h"
32 #include "core/context.h"
33 #include "core/multisample.h"
35 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
);
36 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
37 void ProcessQueryStatsBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
38 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
);
39 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
);
40 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
);
41 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
);
42 void InitClearTilesTable();
43 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
);
44 void InitBackendFuncTables();
45 void InitCPSFuncTables();
// Selector for the back-end function variant. Enumerators take the default
// consecutive values starting at 0, so SWR_BACKEND_FUNCS_MAX equals the number
// of real variants that precede it.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE,
    SWR_BACKEND_MSAA_PIXEL_RATE,
    SWR_BACKEND_MSAA_SAMPLE_RATE,
    SWR_BACKEND_FUNCS_MAX,
};
// SIMD offset constants, declared only when the SIMD width is 8 (one __m256
// holds eight floats). They are defined in another translation unit.
#if KNOB_SIMD_WIDTH == 8
extern const __m256 vCenterOffsetsX;
extern const __m256 vCenterOffsetsY;
extern const __m256 vULOffsetsX;
extern const __m256 vULOffsetsY;
// NOTE(review): the terminator of this conditional was missing. It is closed
// here because generateInputCoverage() below checks KNOB_SIMD_WIDTH == 8 with
// a run-time assert, which would be redundant inside this block - confirm
// that nothing else belongs in the conditional.
#endif
64 INLINE
void generateInputCoverage(const uint64_t *const coverageMask
, uint32_t (&inputMask
)[KNOB_SIMD_WIDTH
], const uint32_t sampleMask
)
67 // will need to update for avx512
68 assert(KNOB_SIMD_WIDTH
== 8);
71 __m256i sampleCoverage
[2];
72 if(T::bIsStandardPattern
)
74 __m256i src
= _mm256_set1_epi32(0);
75 __m256i index0
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1
;
77 if(T::MultisampleT::numSamples
== 1)
79 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
81 else if(T::MultisampleT::numSamples
== 2)
83 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
85 else if(T::MultisampleT::numSamples
== 4)
87 mask
[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
89 else if(T::MultisampleT::numSamples
== 8)
91 mask
[0] = _mm256_set1_epi32(-1);
93 else if(T::MultisampleT::numSamples
== 16)
95 mask
[0] = _mm256_set1_epi32(-1);
96 mask
[1] = _mm256_set1_epi32(-1);
97 index1
= _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
100 // gather coverage for samples 0-7
101 sampleCoverage
[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index0
, _mm256_castsi256_ps(mask
[0]), 8));
102 if(T::MultisampleT::numSamples
> 8)
104 // gather coverage for samples 8-15
105 sampleCoverage
[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src
), (const float*)coverageMask
, index1
, _mm256_castsi256_ps(mask
[1]), 8));
110 // center coverage is the same for all samples; just broadcast to the sample slots
111 uint32_t centerCoverage
= ((uint32_t)(*coverageMask
) & MASK
);
112 if(T::MultisampleT::numSamples
== 1)
114 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage
);
116 else if(T::MultisampleT::numSamples
== 2)
118 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage
, centerCoverage
);
120 else if(T::MultisampleT::numSamples
== 4)
122 sampleCoverage
[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage
, centerCoverage
, centerCoverage
, centerCoverage
);
124 else if(T::MultisampleT::numSamples
== 8)
126 sampleCoverage
[0] = _mm256_set1_epi32(centerCoverage
);
128 else if(T::MultisampleT::numSamples
== 16)
130 sampleCoverage
[0] = _mm256_set1_epi32(centerCoverage
);
131 sampleCoverage
[1] = _mm256_set1_epi32(centerCoverage
);
135 mask
[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
136 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
137 // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
138 __m256i packedCoverage0
= _simd_shuffle_epi8(sampleCoverage
[0], mask
[0]);
140 __m256i packedCoverage1
;
141 if(T::MultisampleT::numSamples
> 8)
143 // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
144 packedCoverage1
= _simd_shuffle_epi8(sampleCoverage
[1], mask
[0]);
147 #if (KNOB_ARCH == KNOB_ARCH_AVX)
148 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
149 __m256i hiToLow
= _mm256_permute2f128_si256(packedCoverage0
, packedCoverage0
, 0x83);
150 __m256 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
151 packedCoverage0
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), shufRes
, 0xFE));
153 __m256i packedSampleCoverage
;
154 if(T::MultisampleT::numSamples
> 8)
156 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
157 hiToLow
= _mm256_permute2f128_si256(packedCoverage1
, packedCoverage1
, 0x83);
158 shufRes
= _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow
), _mm256_castsi256_ps(hiToLow
), _MM_SHUFFLE(1, 1, 0, 1));
159 shufRes
= _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1
), shufRes
, 0xFE);
160 packedCoverage1
= _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes
), _mm256_castps_pd(shufRes
), 0x01)));
161 packedSampleCoverage
= _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0
), _mm256_castsi256_ps(packedCoverage1
), 0xFC));
165 packedSampleCoverage
= packedCoverage0
;
168 __m256i permMask
= _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
169 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
170 packedCoverage0
= _mm256_permutevar8x32_epi32(packedCoverage0
, permMask
);
172 __m256i packedSampleCoverage
;
173 if(T::MultisampleT::numSamples
> 8)
175 permMask
= _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
176 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
177 packedCoverage1
= _mm256_permutevar8x32_epi32(packedCoverage1
, permMask
);
179 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
180 packedSampleCoverage
= _mm256_blend_epi32(packedCoverage0
, packedCoverage1
, 0x0C);
184 packedSampleCoverage
= packedCoverage0
;
188 for(int32_t i
= KNOB_SIMD_WIDTH
- 1; i
>= 0; i
--)
190 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
191 inputMask
[i
] = _simd_movemask_epi8(packedSampleCoverage
);
193 if(!T::bForcedSampleCount
)
195 // input coverage has to be anded with sample mask if MSAA isn't forced on
196 inputMask
[i
] &= sampleMask
;
199 // shift to the next pixel in the 4x2
200 packedSampleCoverage
= _simd_slli_epi32(packedSampleCoverage
, 1);
205 INLINE
void generateInputCoverage(const uint64_t *const coverageMask
, __m256
&inputCoverage
, const uint32_t sampleMask
)
207 uint32_t inputMask
[KNOB_SIMD_WIDTH
];
208 generateInputCoverage
<T
>(coverageMask
, inputMask
, sampleMask
);
209 inputCoverage
= _simd_castsi_ps(_mm256_set_epi32(inputMask
[7], inputMask
[6], inputMask
[5], inputMask
[4], inputMask
[3], inputMask
[2], inputMask
[1], inputMask
[0]));
// SwrBackendTraits: compile-time bundle of back-end configuration flags.
//
// Each template parameter is a uint32_t with a default, and is turned into a
// static const bool by comparing it with a constant:
//   samplePattern == SWR_MSAA_STANDARD_PATTERN -> bIsStandardPattern
//   coverage == 1                              -> bInputCoverage
//   centroid == 1                              -> bCentroidPos
//   forced   == 1                              -> bForcedSampleCount
//   odepth   == 1                              -> bWritesODepth
// MultisampleT is MultisampleTraits for sampleCountT (cast to
// SWR_MULTISAMPLE_COUNT), using the standard pattern when bIsStandardPattern
// is true and the center pattern otherwise.
//
// NOTE(review): the struct's braces are not visible in this listing, so it is
// unclear whether further members follow - confirm against the full file.
212 template<uint32_t sampleCountT
= SWR_MULTISAMPLE_1X
, uint32_t samplePattern
= SWR_MSAA_STANDARD_PATTERN
,
213 uint32_t coverage
= 0, uint32_t centroid
= 0, uint32_t forced
= 0, uint32_t odepth
= 0>
214 struct SwrBackendTraits
216 static const bool bIsStandardPattern
= (samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
217 static const bool bInputCoverage
= (coverage
== 1);
218 static const bool bCentroidPos
= (centroid
== 1);
219 static const bool bForcedSampleCount
= (forced
== 1);
220 static const bool bWritesODepth
= (odepth
== 1);
221 typedef MultisampleTraits
<(SWR_MULTISAMPLE_COUNT
)sampleCountT
, (bIsStandardPattern
) ? SWR_MSAA_STANDARD_PATTERN
: SWR_MSAA_CENTER_PATTERN
> MultisampleT
;