swr: [rasterizer core] backend refactor
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.h
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29 #pragma once
30
31 #include "common/os.h"
32 #include "core/context.h"
33 #include "core/multisample.h"
34
35 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
36 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
37 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
38 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
39 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
40 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
41 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
42 void InitClearTilesTable();
43 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
44 void InitBackendFuncTables();
45 void InitCPSFuncTables();
46
// Backend variants.
// NOTE(review): presumably an index into the tables filled by
// InitBackendFuncTables() - confirm at the use sites.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE    = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
    SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
    SWR_BACKEND_FUNCS_MAX        = 3, // count of the variants above; must stay last
};
54
#if (KNOB_SIMD_WIDTH == 8)
// Eight set bits, matching the 8-lane SIMD width this section is compiled for.
#define MASK 0xff

// 8-wide float constants; only declared here, defined in another translation unit.
// NOTE(review): by name these are per-pixel upper-left (UL) and pixel-center
// offsets for the SIMD tile - confirm against the definitions.
extern const __m256 vULOffsetsX;
extern const __m256 vULOffsetsY;
extern const __m256 vCenterOffsetsX;
extern const __m256 vCenterOffsetsY;
#endif
62
63 template<typename T>
64 INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
65 {
66
67 // will need to update for avx512
68 assert(KNOB_SIMD_WIDTH == 8);
69
70 __m256i mask[2];
71 __m256i sampleCoverage[2];
72 if(T::bIsStandardPattern)
73 {
74 __m256i src = _mm256_set1_epi32(0);
75 __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
76
77 if(T::MultisampleT::numSamples == 1)
78 {
79 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
80 }
81 else if(T::MultisampleT::numSamples == 2)
82 {
83 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
84 }
85 else if(T::MultisampleT::numSamples == 4)
86 {
87 mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
88 }
89 else if(T::MultisampleT::numSamples == 8)
90 {
91 mask[0] = _mm256_set1_epi32(-1);
92 }
93 else if(T::MultisampleT::numSamples == 16)
94 {
95 mask[0] = _mm256_set1_epi32(-1);
96 mask[1] = _mm256_set1_epi32(-1);
97 index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
98 }
99
100 // gather coverage for samples 0-7
101 sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
102 if(T::MultisampleT::numSamples > 8)
103 {
104 // gather coverage for samples 8-15
105 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
106 }
107 }
108 else
109 {
110 // center coverage is the same for all samples; just broadcast to the sample slots
111 uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
112 if(T::MultisampleT::numSamples == 1)
113 {
114 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
115 }
116 else if(T::MultisampleT::numSamples == 2)
117 {
118 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
119 }
120 else if(T::MultisampleT::numSamples == 4)
121 {
122 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
123 }
124 else if(T::MultisampleT::numSamples == 8)
125 {
126 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
127 }
128 else if(T::MultisampleT::numSamples == 16)
129 {
130 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
131 sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
132 }
133 }
134
135 mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
136 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
137 // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
138 __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
139
140 __m256i packedCoverage1;
141 if(T::MultisampleT::numSamples > 8)
142 {
143 // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
144 packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
145 }
146
147 #if (KNOB_ARCH == KNOB_ARCH_AVX)
148 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
149 __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
150 __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
151 packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
152
153 __m256i packedSampleCoverage;
154 if(T::MultisampleT::numSamples > 8)
155 {
156 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
157 hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
158 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
159 shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
160 packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
161 packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
162 }
163 else
164 {
165 packedSampleCoverage = packedCoverage0;
166 }
167 #else
168 __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
169 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
170 packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
171
172 __m256i packedSampleCoverage;
173 if(T::MultisampleT::numSamples > 8)
174 {
175 permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
176 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
177 packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
178
179 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
180 packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
181 }
182 else
183 {
184 packedSampleCoverage = packedCoverage0;
185 }
186 #endif
187
188 for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
189 {
190 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
191 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
192
193 if(!T::bForcedSampleCount)
194 {
195 // input coverage has to be anded with sample mask if MSAA isn't forced on
196 inputMask[i] &= sampleMask;
197 }
198
199 // shift to the next pixel in the 4x2
200 packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
201 }
202 }
203
204 template<typename T>
205 INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
206 {
207 uint32_t inputMask[KNOB_SIMD_WIDTH];
208 generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
209 inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
210 }
211
212 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
213 uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t odepth = 0>
214 struct SwrBackendTraits
215 {
216 static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
217 static const bool bInputCoverage = (coverage == 1);
218 static const bool bCentroidPos = (centroid == 1);
219 static const bool bForcedSampleCount = (forced == 1);
220 static const bool bWritesODepth = (odepth == 1);
221 typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT;
222 };