swr: [rasterizer core] conservative rast degenerate handling
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include <type_traits>
32 #include "common/os.h"
33 #include "common/simdintrin.h"
34 #include "common/swr_assert.h"
35
36 #if defined(_WIN64) || defined(__x86_64__)
37 #define _MM_INSERT_EPI64 _mm_insert_epi64
38 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
39 #else
40 INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
41 {
42 OSALIGNLINE(uint32_t) elems[4];
43 _mm_store_si128((__m128i*)elems, a);
44 if (ndx == 0)
45 {
46 uint64_t foo = elems[0];
47 foo |= (uint64_t)elems[1] << 32;
48 return foo;
49 }
50 else
51 {
52 uint64_t foo = elems[2];
53 foo |= (uint64_t)elems[3] << 32;
54 return foo;
55 }
56 }
57
58 INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
59 {
60 OSALIGNLINE(int64_t) elems[2];
61 _mm_store_si128((__m128i*)elems, a);
62 if (ndx == 0)
63 {
64 elems[0] = b;
65 }
66 else
67 {
68 elems[1] = b;
69 }
70 __m128i out;
71 out = _mm_load_si128((const __m128i*)elems);
72 return out;
73 }
74 #endif
75
76 OSALIGNLINE(struct) BBOX
77 {
78 int top{ 0 };
79 int bottom{ 0 };
80 int left{ 0 };
81 int right{ 0 };
82
83 BBOX() {}
84 BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
85
86 bool operator==(const BBOX& rhs)
87 {
88 return (this->top == rhs.top &&
89 this->bottom == rhs.bottom &&
90 this->left == rhs.left &&
91 this->right == rhs.right);
92 }
93
94 bool operator!=(const BBOX& rhs)
95 {
96 return !(*this == rhs);
97 }
98 };
99
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD bounding box: one integer SIMD register per edge, holding
///        the bounds for KNOB_SIMD_WIDTH primitives at once (SOA layout).
struct simdBBox
{
    simdscalari top;
    simdscalari bottom;
    simdscalari left;
    simdscalari right;
};
107
108 INLINE
109 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
110 {
111 __m128i row0i = _mm_castps_si128(row0);
112 __m128i row1i = _mm_castps_si128(row1);
113 __m128i row2i = _mm_castps_si128(row2);
114 __m128i row3i = _mm_castps_si128(row3);
115
116 __m128i vTemp = row2i;
117 row2i = _mm_unpacklo_epi32(row2i, row3i);
118 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
119
120 row3i = row0i;
121 row0i = _mm_unpacklo_epi32(row0i, row1i);
122 row3i = _mm_unpackhi_epi32(row3i, row1i);
123
124 row1i = row0i;
125 row0i = _mm_unpacklo_epi64(row0i, row2i);
126 row1i = _mm_unpackhi_epi64(row1i, row2i);
127
128 row2i = row3i;
129 row2i = _mm_unpacklo_epi64(row2i, vTemp);
130 row3i = _mm_unpackhi_epi64(row3i, vTemp);
131
132 row0 = _mm_castsi128_ps(row0i);
133 row1 = _mm_castsi128_ps(row1i);
134 row2 = _mm_castsi128_ps(row2i);
135 row3 = _mm_castsi128_ps(row3i);
136 }
137
138 INLINE
139 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
140 {
141 __m128i vTemp = row2;
142 row2 = _mm_unpacklo_epi32(row2, row3);
143 vTemp = _mm_unpackhi_epi32(vTemp, row3);
144
145 row3 = row0;
146 row0 = _mm_unpacklo_epi32(row0, row1);
147 row3 = _mm_unpackhi_epi32(row3, row1);
148
149 row1 = row0;
150 row0 = _mm_unpacklo_epi64(row0, row2);
151 row1 = _mm_unpackhi_epi64(row1, row2);
152
153 row2 = row3;
154 row2 = _mm_unpacklo_epi64(row2, vTemp);
155 row3 = _mm_unpackhi_epi64(row3, vTemp);
156 }
157
// Composite gcc version number, e.g. 4.9.0 -> 40900, for feature testing.
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// clang and gcc before 4.9 do not provide the _mm*_undefined_* intrinsics;
// substitute zeroed registers (slightly slower, but correct).
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
169
170 #if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
171 INLINE
172 void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
173 {
174 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
175 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
176 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
177 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
178
179 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
180 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
181 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
182 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
183
184 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
185 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
186 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
187 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
188
189 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
190 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
191 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
192 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
193 }
194
195 INLINE
196 void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
197 {
198 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
199 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
200 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
201 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
202
203 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
204 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
205 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
206 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
207
208 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
209 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
210 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
211 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
212
213 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
214 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
215 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
216 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
217 }
218
219 INLINE
220 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
221 {
222 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
223 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
224 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
225 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
226 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
227 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
228 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
229 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
230 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
231 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
232 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
233 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
234 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
235 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
236 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
237 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
238 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
239 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
240 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
241 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
242 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
243 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
244 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
245 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
246 }
247
248 INLINE
249 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
250 {
251 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
252 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
253 }
254 #endif
255
256 //////////////////////////////////////////////////////////////////////////
257 /// TranposeSingleComponent
258 //////////////////////////////////////////////////////////////////////////
259 template<uint32_t bpp>
260 struct TransposeSingleComponent
261 {
262 //////////////////////////////////////////////////////////////////////////
263 /// @brief Pass-thru for single component.
264 /// @param pSrc - source data in SOA form
265 /// @param pDst - output data in AOS form
266 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
267 {
268 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
269 }
270 };
271
272 //////////////////////////////////////////////////////////////////////////
273 /// Transpose8_8_8_8
274 //////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    ///        Input: 8 (or 16) bytes of R, then G, then B, then A;
    ///        output: interleaved RGBA pixels.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1 path: no 256-bit byte shuffle available, so interleave with
        // 128-bit unpacks: bytes -> RG/BA pairs -> RGBA quads.
        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2 path: in-lane byte shuffles place R/G (dst01) and B/A (dst23,
        // after swapping 128-bit lanes) into their final positions; 0x80 in a
        // mask byte zeroes that byte, so OR merges the two halves.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#elif KNOB_SIMD_WIDTH == 16
        // 16-wide path: same shuffle/permute/OR scheme as AVX2, expressed via
        // the _simd_* wrappers.
        simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);

        simdscalari dst01 = _simd_shuffle_epi8(src, mask0);

        simdscalari perm1 = _simd_permute_128(src, src, 1);

        simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);

        simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);

        simdscalari dst = _simd_or_si(dst01, dst23);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
};
325
326 //////////////////////////////////////////////////////////////////////////
327 /// Transpose8_8_8
328 //////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempt to use it fails at compile time rather than at runtime.
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
337
338 //////////////////////////////////////////////////////////////////////////
339 /// Transpose8_8
340 //////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    ///        Input: a run of R bytes followed by a run of G bytes;
    ///        output: interleaved RG pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        // Only the low 128 bits hold data (8 R bytes + 8 G bytes).
        __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);   // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);
        _mm_store_si128((__m128i*)pDst, rg);
#elif KNOB_SIMD_WIDTH == 16
        __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg

        // Duplicate each 64-bit quarter so R and G line up per 128-bit lane,
        // then interleave bytes within lanes.
        __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

        __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx

        __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

        _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
};
371
372 //////////////////////////////////////////////////////////////////////////
373 /// Transpose32_32_32_32
374 //////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    ///        Input: one SIMD register of floats per component (R,G,B,A);
    ///        output: one 4-float RGBA vector per pixel.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // Load the four component registers (8 floats each).
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

        // Transpose into 8 per-pixel XYZW vectors, then store sequentially.
        __m128 vDst[8];
        vTranspose4x8(vDst, src0, src1, src2, src3);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst+4, vDst[1]);
        _mm_store_ps((float*)pDst+8, vDst[2]);
        _mm_store_ps((float*)pDst+12, vDst[3]);
        _mm_store_ps((float*)pDst+16, vDst[4]);
        _mm_store_ps((float*)pDst+20, vDst[5]);
        _mm_store_ps((float*)pDst+24, vDst[6]);
        _mm_store_ps((float*)pDst+28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated 16-wide registers are {lo, hi} __m256 pairs: transpose and
        // store each half separately (pixels 0-7, then 8-15).
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
        simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48);

        __m128 vDst[8];

        vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
#else
#error Unsupported vector width
#endif
    }
};
435
436 //////////////////////////////////////////////////////////////////////////
437 /// Transpose32_32_32
438 //////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    ///        Input: one SIMD register of floats per component (R,G,B);
    ///        output: one 4-float vector per pixel (4th lane undefined).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // Load the three component registers (8 floats each).
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

        // Transpose into 8 per-pixel XYZ? vectors; each pixel still occupies
        // 4 floats in the output, with the 4th lane undefined.
        __m128 vDst[8];
        vTranspose3x8(vDst, src0, src1, src2);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated 16-wide registers are {lo, hi} __m256 pairs: transpose and
        // store each half separately (pixels 0-7, then 8-15).
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);

        __m128 vDst[8];

        vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
#else
#error Unsupported vector width
#endif
    }
};
497
498 //////////////////////////////////////////////////////////////////////////
499 /// Transpose32_32
500 //////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
    ///        Input: a run of R floats followed by a run of G floats;
    ///        output: interleaved RG pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // Load 8 R floats (two 128-bit halves) and 8 G floats.
        const float* pfSrc = (const float*)pSrc;
        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
        __m128 src_g1 = _mm_load_ps(pfSrc + 12);

        // Interleave R and G element-wise: r0g0 r1g1 ...
        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm_store_ps(pfDst + 0, dst0);
        _mm_store_ps(pfDst + 4, dst1);
        _mm_store_ps(pfDst + 8, dst2);
        _mm_store_ps(pfDst + 12, dst3);
#elif KNOB_SIMD_WIDTH == 16
        // Same scheme with 256-bit registers; note _mm256_unpack*_ps
        // interleaves within each 128-bit lane, so the stores below are in
        // lane-interleaved order, matching the producers of this layout.
        const float* pfSrc = (const float*)pSrc;
        __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
        __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
        __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
        __m256 src_g1 = _mm256_load_ps(pfSrc + 24);

        __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
        __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
        __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
        __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm256_store_ps(pfDst + 0, dst0);
        _mm256_store_ps(pfDst + 8, dst1);
        _mm256_store_ps(pfDst + 16, dst2);
        _mm256_store_ps(pfDst + 24, dst3);
#else
#error Unsupported vector width
#endif
    }
};
548
549 //////////////////////////////////////////////////////////////////////////
550 /// Transpose16_16_16_16
551 //////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    ///        Input: runs of 16-bit R, G, B, A values; output: interleaved
    ///        RGBA (r0g0b0a0 r1g1b1a1 ...).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // First 256-bit load holds R then G; second holds B then A.
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // Interleave 16-bit R/G and B/A pairs, then 32-bit RG/BA quads.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated 16-wide path: {lo, hi} halves of each register already
        // separate the components, then same unpack cascade at 256 bits.
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
        simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari)));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = src_ba.lo;
        __m256i src_a = src_ba.hi;

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
#else
#error Unsupported vector width
#endif
    }
};
613
614 //////////////////////////////////////////////////////////////////////////
615 /// Transpose16_16_16
616 //////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    ///        Like 16_16_16_16 but the alpha slot is filled from an
    ///        undefined register; each output pixel still occupies 64 bits,
    ///        with the 4th 16-bit lane undefined.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        // No source alpha; lanes interleaved from this register are garbage.
        __m128i src_a = _mm_undefined_si128();

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated 16-wide path: same scheme at 256 bits.
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari)));
        __m256i src_a = _mm256_undefined_si256();

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
#else
#error Unsupported vector width
#endif
    }
};
676
677 //////////////////////////////////////////////////////////////////////////
678 /// Transpose16_16
679 //////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    ///        Input: a run of 16-bit R values followed by 16-bit G values;
    ///        output: interleaved RG pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        // Low 128 bits hold the 8 R values, high 128 bits the 8 G values.
        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        // Interleave 16-bit R and G element-wise.
        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated 16-wide path: lo half holds R, hi half holds G.
        simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));

        simdscalari result;

        result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
        result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
#endif
#else
#error Unsupported vector width
#endif
    }
};
718
719 //////////////////////////////////////////////////////////////////////////
720 /// Transpose24_8
721 //////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose implemented for this format (compile error on use).
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
730
731 //////////////////////////////////////////////////////////////////////////
732 /// Transpose32_8_24
733 //////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose implemented for this format (compile error on use).
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
742
743
744
745 //////////////////////////////////////////////////////////////////////////
746 /// Transpose4_4_4_4
747 //////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose implemented for this format (compile error on use).
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
756
757 //////////////////////////////////////////////////////////////////////////
758 /// Transpose5_6_5
759 //////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose implemented for this format (compile error on use).
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
768
769 //////////////////////////////////////////////////////////////////////////
770 /// Transpose9_9_9_5
771 //////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose implemented for this format (compile error on use).
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
780
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format; selecting
    ///       it is a compile-time error by design.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
792
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format; selecting
    ///       it is a compile-time error by design.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
804
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format; selecting
    ///       it is a compile-time error by design.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
816
817 // helper function to unroll loops
818 template<int Begin, int End, int Step = 1>
819 struct UnrollerL {
820 template<typename Lambda>
821 INLINE static void step(Lambda& func) {
822 func(Begin);
823 UnrollerL<Begin + Step, End, Step>::step(func);
824 }
825 };
826
827 template<int End, int Step>
828 struct UnrollerL<End, End, Step> {
829 template<typename Lambda>
830 static void step(Lambda& func) {
831 }
832 };
833
834 // helper function to unroll loops, with mask to skip specific iterations
835 template<int Begin, int End, int Step = 1, int Mask = 0x7f>
836 struct UnrollerLMask {
837 template<typename Lambda>
838 INLINE static void step(Lambda& func) {
839 if(Mask & (1 << Begin))
840 {
841 func(Begin);
842 }
843 UnrollerL<Begin + Step, End, Step>::step(func);
844 }
845 };
846
847 template<int End, int Step, int Mask>
848 struct UnrollerLMask<End, End, Step, Mask> {
849 template<typename Lambda>
850 static void step(Lambda& func) {
851 }
852 };
853
854 // general CRC compute
855 INLINE
856 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
857 {
858 #if defined(_WIN64) || defined(__x86_64__)
859 uint32_t sizeInQwords = size / sizeof(uint64_t);
860 uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
861 uint64_t* pDataWords = (uint64_t*)pData;
862 for (uint32_t i = 0; i < sizeInQwords; ++i)
863 {
864 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
865 }
866 #else
867 uint32_t sizeInDwords = size / sizeof(uint32_t);
868 uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
869 uint32_t* pDataWords = (uint32_t*)pData;
870 for (uint32_t i = 0; i < sizeInDwords; ++i)
871 {
872 crc = _mm_crc32_u32(crc, *pDataWords++);
873 }
874 #endif
875
876 uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
877 for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
878 {
879 crc = _mm_crc32_u8(crc, *pRemainderBytes++);
880 }
881
882 return crc;
883 }
884
885 //////////////////////////////////////////////////////////////////////////
886 /// Add byte offset to any-type pointer
887 //////////////////////////////////////////////////////////////////////////
888 template <typename T>
889 INLINE
890 static T* PtrAdd(T* p, intptr_t offset)
891 {
892 intptr_t intp = reinterpret_cast<intptr_t>(p);
893 return reinterpret_cast<T*>(intp + offset);
894 }
895
896 //////////////////////////////////////////////////////////////////////////
897 /// Is a power-of-2?
898 //////////////////////////////////////////////////////////////////////////
899 template <typename T>
900 INLINE
901 static bool IsPow2(T value)
902 {
903 return value == (value & (0 - value));
904 }
905
906 //////////////////////////////////////////////////////////////////////////
907 /// Align down to specified alignment
908 /// Note: IsPow2(alignment) MUST be true
909 //////////////////////////////////////////////////////////////////////////
910 template <typename T1, typename T2>
911 INLINE
912 static T1 AlignDownPow2(T1 value, T2 alignment)
913 {
914 SWR_ASSERT(IsPow2(alignment));
915 return value & ~T1(alignment - 1);
916 }
917
918 //////////////////////////////////////////////////////////////////////////
919 /// Align up to specified alignment
920 /// Note: IsPow2(alignment) MUST be true
921 //////////////////////////////////////////////////////////////////////////
922 template <typename T1, typename T2>
923 INLINE
924 static T1 AlignUpPow2(T1 value, T2 alignment)
925 {
926 return AlignDownPow2(value + T1(alignment - 1), alignment);
927 }
928
929 //////////////////////////////////////////////////////////////////////////
930 /// Align up ptr to specified alignment
931 /// Note: IsPow2(alignment) MUST be true
932 //////////////////////////////////////////////////////////////////////////
933 template <typename T1, typename T2>
934 INLINE
935 static T1* AlignUpPow2(T1* value, T2 alignment)
936 {
937 return reinterpret_cast<T1*>(
938 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
939 }
940
941 //////////////////////////////////////////////////////////////////////////
942 /// Align down to specified alignment
943 //////////////////////////////////////////////////////////////////////////
944 template <typename T1, typename T2>
945 INLINE
946 static T1 AlignDown(T1 value, T2 alignment)
947 {
948 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
949 return value - T1(value % alignment);
950 }
951
952 //////////////////////////////////////////////////////////////////////////
953 /// Align down to specified alignment
954 //////////////////////////////////////////////////////////////////////////
955 template <typename T1, typename T2>
956 INLINE
957 static T1* AlignDown(T1* value, T2 alignment)
958 {
959 return (T1*)AlignDown(uintptr_t(value), alignment);
960 }
961
962 //////////////////////////////////////////////////////////////////////////
963 /// Align up to specified alignment
964 /// Note: IsPow2(alignment) MUST be true
965 //////////////////////////////////////////////////////////////////////////
966 template <typename T1, typename T2>
967 INLINE
968 static T1 AlignUp(T1 value, T2 alignment)
969 {
970 return AlignDown(value + T1(alignment - 1), alignment);
971 }
972
973 //////////////////////////////////////////////////////////////////////////
974 /// Align up to specified alignment
975 /// Note: IsPow2(alignment) MUST be true
976 //////////////////////////////////////////////////////////////////////////
977 template <typename T1, typename T2>
978 INLINE
979 static T1* AlignUp(T1* value, T2 alignment)
980 {
981 return AlignDown(PtrAdd(value, alignment - 1), alignment);
982 }
983
//////////////////////////////////////////////////////////////////////////
/// Read-only packed array of elements narrower than a machine word.
/// Each element occupies BitsPerElementT bits inside a size_t word;
/// all elements are zero-initialized.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must an integral fraction of pointer size");

    // Backing storage; elements are packed LSB-first within each word.
    size_t m_words[NUM_WORDS] = {};

public:

    /// Extract element elementIndex, zero-extended into T.
    T operator[] (size_t elementIndex) const
    {
        size_t wordIndex = elementIndex / ELEMENTS_PER_WORD;
        size_t bitShift  = (elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT;
        return T((m_words[wordIndex] >> bitShift) & ELEMENT_MASK);
    }
};
1011
// Ranged integer argument for TemplateArgUnroller: carries a runtime value
// expected to lie in [TMin, TMax] so the unroller can convert it into a
// compile-time std::integral_constant by walking the range.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // runtime value; must satisfy TMin <= val <= TMax
};
1018
// Recursive template used to auto-nest conditionals. Converts dynamic boolean function
// arguments to static template arguments.
//
// TermT supplies the terminal: TermT::FuncType (the type returned, used like a
// function pointer — note the nullptr fallback below) and
// TermT::GetFunc<Args...>() (the statically-specialized implementation).
// ArgsB accumulates the std::true_type / std::false_type /
// std::integral_constant types produced so far, one per runtime argument
// consumed.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator: single remaining bool — bind it and ask TermT
    // for the fully-specialized function.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args: bind this bool as a type and recurse on the rest.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator: linear search — peel TMax down toward TMin until
    // it matches iArg.val, then bind it as integral_constant<uint32_t, TMax>.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        // iArg.val fell outside [TMin, TMax]: caller contract violation.
        SWR_ASSUME(false); return nullptr;
    }
    // Degenerate single-value range (TMin == TMax): val must equal TVal.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args: same linear search, then recurse on the rest.
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        // iArg.val fell outside [TMin, TMax]: caller contract violation.
        SWR_ASSUME(false); return nullptr;
    }
    // Degenerate single-value range with more args to consume.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1097
1098