swr: avoid using exceptions for expected condition handling
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include <type_traits>
32 #include <algorithm>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
36 #include "core/api.h"
37
38 #if defined(_WIN64) || defined(__x86_64__)
39 #define _MM_INSERT_EPI64 _mm_insert_epi64
40 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
41 #else
42 INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
43 {
44 OSALIGNLINE(uint32_t) elems[4];
45 _mm_store_si128((__m128i*)elems, a);
46 if (ndx == 0)
47 {
48 uint64_t foo = elems[0];
49 foo |= (uint64_t)elems[1] << 32;
50 return foo;
51 }
52 else
53 {
54 uint64_t foo = elems[2];
55 foo |= (uint64_t)elems[3] << 32;
56 return foo;
57 }
58 }
59
60 INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
61 {
62 OSALIGNLINE(int64_t) elems[2];
63 _mm_store_si128((__m128i*)elems, a);
64 if (ndx == 0)
65 {
66 elems[0] = b;
67 }
68 else
69 {
70 elems[1] = b;
71 }
72 __m128i out;
73 out = _mm_load_si128((const __m128i*)elems);
74 return out;
75 }
76 #endif
77
//////////////////////////////////////////////////////////////////////////
/// @brief SOA bounding boxes: one integer box per SIMD lane.
struct simdBBox
{
    simdscalari ymin;   // per-lane minimum y
    simdscalari ymax;   // per-lane maximum y
    simdscalari xmin;   // per-lane minimum x
    simdscalari xmax;   // per-lane maximum x
};
85
86 INLINE
87 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
88 {
89 __m128i row0i = _mm_castps_si128(row0);
90 __m128i row1i = _mm_castps_si128(row1);
91 __m128i row2i = _mm_castps_si128(row2);
92 __m128i row3i = _mm_castps_si128(row3);
93
94 __m128i vTemp = row2i;
95 row2i = _mm_unpacklo_epi32(row2i, row3i);
96 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
97
98 row3i = row0i;
99 row0i = _mm_unpacklo_epi32(row0i, row1i);
100 row3i = _mm_unpackhi_epi32(row3i, row1i);
101
102 row1i = row0i;
103 row0i = _mm_unpacklo_epi64(row0i, row2i);
104 row1i = _mm_unpackhi_epi64(row1i, row2i);
105
106 row2i = row3i;
107 row2i = _mm_unpacklo_epi64(row2i, vTemp);
108 row3i = _mm_unpackhi_epi64(row3i, vTemp);
109
110 row0 = _mm_castsi128_ps(row0i);
111 row1 = _mm_castsi128_ps(row1i);
112 row2 = _mm_castsi128_ps(row2i);
113 row3 = _mm_castsi128_ps(row3i);
114 }
115
116 INLINE
117 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
118 {
119 __m128i vTemp = row2;
120 row2 = _mm_unpacklo_epi32(row2, row3);
121 vTemp = _mm_unpackhi_epi32(vTemp, row3);
122
123 row3 = row0;
124 row0 = _mm_unpacklo_epi32(row0, row1);
125 row3 = _mm_unpackhi_epi32(row3, row1);
126
127 row1 = row0;
128 row0 = _mm_unpacklo_epi64(row0, row2);
129 row1 = _mm_unpackhi_epi64(row1, row2);
130
131 row2 = row3;
132 row2 = _mm_unpacklo_epi64(row2, vTemp);
133 row3 = _mm_unpackhi_epi64(row3, vTemp);
134 }
135
// Encode the gcc version as one comparable integer, e.g. 4.9.0 -> 40900.
// (Safe on non-gcc compilers: undefined identifiers evaluate to 0 in #if.)
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// Map the _mm*_undefined_* intrinsics to zero-initialization on compilers
// assumed not to provide them (clang here, and gcc < 4.9). Zeroing is always
// a valid substitute where an "undefined" value is permitted.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
147
148 #if KNOB_SIMD_WIDTH == 8
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes 3 SOA component vectors (8 elements each) into 8 AOS
///        xyzw outputs. The w slot of every output comes from
///        _mm256_undefined_ps() and must be treated as garbage.
/// @param vDst  - receives one xyz? quad per element
/// @param vSrc0 - x components
/// @param vSrc1 - y components
/// @param vSrc2 - z components
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
{
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());  //y0w0y1w1 y4w4y5w5
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                         //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());         //y2w2y3w3 y6w6yw77
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7

    // Low 128-bit lanes hold elements 0-3, high lanes elements 4-7.
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}
172
173 INLINE
174 void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
175 {
176 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
177 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
178 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
179 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
180
181 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
182 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
183 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
184 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
185
186 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
187 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
188 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
189 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
190
191 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
192 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
193 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
194 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
195 }
196
197 INLINE
198 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
199 {
200 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
201 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
202 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
203 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
204 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
205 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
206 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
207 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
208 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
209 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
210 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
211 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
212 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
213 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
214 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
215 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
216 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
217 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
218 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
219 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
220 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
221 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
222 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
223 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
224 }
225
226 INLINE
227 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
228 {
229 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
230 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
231 }
232 #endif
233
234 //////////////////////////////////////////////////////////////////////////
235 /// TranposeSingleComponent
236 //////////////////////////////////////////////////////////////////////////
237 template<uint32_t bpp>
238 struct TransposeSingleComponent
239 {
240 //////////////////////////////////////////////////////////////////////////
241 /// @brief Pass-thru for single component.
242 /// @param pSrc - source data in SOA form
243 /// @param pDst - output data in AOS form
244 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
245 {
246 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
247 }
248 #if ENABLE_AVX512_SIMD16
249
250 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
251 {
252 memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
253 }
254 #endif
255 };
256
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data:
    ///        runs of r, g, b, a bytes become interleaved rgba dwords.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1 has no 256-bit byte shuffle: split into 128-bit halves and
        // interleave with SSE unpacks.
        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2: byte-shuffle each half into its dword slots (0x80 selectors
        // produce zero bytes), swap halves, shuffle the complement, then OR
        // the two zero-padded partial results together.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant: same shuffle / half-swap / OR scheme as the
    ///        AVX2 path, expressed with the simd16 wrappers.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

        // Scatter the low-half (r/g) bytes to their dword positions;
        // 0x80 selectors zero the byte.
        simd16scalari mask0 = _simd16_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);

        simd16scalari dst01 = _simd16_shuffle_epi8(src, mask0);

        // Swap halves so the b/a bytes can fill the complementary slots.
        simd16scalari perm1 = _simd16_permute2f128_si(src, src, 1);

        simd16scalari mask1 = _simd16_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);

        simd16scalari dst23 = _simd16_shuffle_epi8(perm1, mask1);

        // Merge the two zero-padded partial results.
        simd16scalari dst = _simd16_or_si(dst01, dst23);

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);
    }
#endif
};
317
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    ///        Deliberately deleted: this format has no transpose
    ///        implementation, so any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
333
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data:
    ///        a run of r bytes followed by a run of g bytes becomes
    ///        interleaved rg pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        // Only the low 128 bits carry data (8 r bytes + 8 g bytes).
        __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);   // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);            // rgrgrgrg rgrgrgrg
        _mm_store_si128((__m128i*)pDst, rg);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant: rearranges each 256-bit register so the
    ///        r and g byte runs line up lane-wise, then interleaves.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari r = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg

        // Bring the g run into the low lane of its own register.
        simdscalari g = _simd_permute2f128_si(r, r, 1); // ggggggggggggggggxxxxxxxxxxxxxxxx

        // Replicate the upper 8 bytes of each low lane into the high lane so
        // the per-lane unpack below sees the right source bytes.
        r = _simd_insertf128_si(r, _mm_srli_si128(_simd_extractf128_si(r, 0), 8), 1); // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

        g = _simd_insertf128_si(g, _mm_srli_si128(_simd_extractf128_si(g, 0), 8), 1); // ggggggggxxxxxxxxggggggggxxxxxxxx

        simdscalari dst = _simd_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
    }
#endif
};
374
375 //////////////////////////////////////////////////////////////////////////
376 /// Transpose32_32_32_32
377 //////////////////////////////////////////////////////////////////////////
378 struct Transpose32_32_32_32
379 {
380 //////////////////////////////////////////////////////////////////////////
381 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
382 /// @param pSrc - source data in SOA form
383 /// @param pDst - output data in AOS form
384 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
385 {
386 #if KNOB_SIMD_WIDTH == 8
387 simdscalar src0 = _simd_load_ps((const float*)pSrc);
388 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
389 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
390 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
391
392 __m128 vDst[8];
393 vTranspose4x8(vDst, src0, src1, src2, src3);
394 _mm_store_ps((float*)pDst, vDst[0]);
395 _mm_store_ps((float*)pDst+4, vDst[1]);
396 _mm_store_ps((float*)pDst+8, vDst[2]);
397 _mm_store_ps((float*)pDst+12, vDst[3]);
398 _mm_store_ps((float*)pDst+16, vDst[4]);
399 _mm_store_ps((float*)pDst+20, vDst[5]);
400 _mm_store_ps((float*)pDst+24, vDst[6]);
401 _mm_store_ps((float*)pDst+28, vDst[7]);
402 #else
403 #error Unsupported vector width
404 #endif
405 }
406 #if ENABLE_AVX512_SIMD16
407
408 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
409 {
410 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
411 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
412 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
413 simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
414
415 __m128 vDst[8];
416
417 vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0));
418
419 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
420 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
421
422 vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1));
423
424 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[2]);
425 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[3]);
426 }
427 #endif
428 };
429
430 //////////////////////////////////////////////////////////////////////////
431 /// Transpose32_32_32
432 //////////////////////////////////////////////////////////////////////////
433 struct Transpose32_32_32
434 {
435 //////////////////////////////////////////////////////////////////////////
436 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
437 /// @param pSrc - source data in SOA form
438 /// @param pDst - output data in AOS form
439 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
440 {
441 #if KNOB_SIMD_WIDTH == 8
442 simdscalar src0 = _simd_load_ps((const float*)pSrc);
443 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
444 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
445
446 __m128 vDst[8];
447 vTranspose3x8(vDst, src0, src1, src2);
448 _mm_store_ps((float*)pDst, vDst[0]);
449 _mm_store_ps((float*)pDst + 4, vDst[1]);
450 _mm_store_ps((float*)pDst + 8, vDst[2]);
451 _mm_store_ps((float*)pDst + 12, vDst[3]);
452 _mm_store_ps((float*)pDst + 16, vDst[4]);
453 _mm_store_ps((float*)pDst + 20, vDst[5]);
454 _mm_store_ps((float*)pDst + 24, vDst[6]);
455 _mm_store_ps((float*)pDst + 28, vDst[7]);
456 #else
457 #error Unsupported vector width
458 #endif
459 }
460 #if ENABLE_AVX512_SIMD16
461
462 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
463 {
464 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
465 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
466 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
467
468 __m128 vDst[8];
469
470 vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0));
471
472 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
473 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
474
475 vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1));
476
477 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[2]);
478 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[3]);
479 }
480 #endif
481 };
482
483 //////////////////////////////////////////////////////////////////////////
484 /// Transpose32_32
485 //////////////////////////////////////////////////////////////////////////
486 struct Transpose32_32
487 {
488 //////////////////////////////////////////////////////////////////////////
489 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
490 /// @param pSrc - source data in SOA form
491 /// @param pDst - output data in AOS form
492 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
493 {
494 #if KNOB_SIMD_WIDTH == 8
495 const float* pfSrc = (const float*)pSrc;
496 __m128 src_r0 = _mm_load_ps(pfSrc + 0);
497 __m128 src_r1 = _mm_load_ps(pfSrc + 4);
498 __m128 src_g0 = _mm_load_ps(pfSrc + 8);
499 __m128 src_g1 = _mm_load_ps(pfSrc + 12);
500
501 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
502 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
503 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
504 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
505
506 float* pfDst = (float*)pDst;
507 _mm_store_ps(pfDst + 0, dst0);
508 _mm_store_ps(pfDst + 4, dst1);
509 _mm_store_ps(pfDst + 8, dst2);
510 _mm_store_ps(pfDst + 12, dst3);
511 #else
512 #error Unsupported vector width
513 #endif
514 }
515 #if ENABLE_AVX512_SIMD16
516
517 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
518 {
519 simdscalar src_r0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
520 simdscalar src_r1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 8);
521 simdscalar src_g0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
522 simdscalar src_g1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 24);
523
524 simdscalar dst0 = _simd_unpacklo_ps(src_r0, src_g0);
525 simdscalar dst1 = _simd_unpacklo_ps(src_r0, src_g0);
526 simdscalar dst2 = _simd_unpacklo_ps(src_r1, src_g1);
527 simdscalar dst3 = _simd_unpacklo_ps(src_r1, src_g1);
528
529 _simd_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0);
530 _simd_store_ps(reinterpret_cast<float *>(pDst) + 8, dst1);
531 _simd_store_ps(reinterpret_cast<float *>(pDst) + 16, dst2);
532 _simd_store_ps(reinterpret_cast<float *>(pDst) + 24, dst3);
533 }
534 #endif
535 };
536
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data:
    ///        runs of 16-bit r, g, b, a values become interleaved rgba quads.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // First 256-bit load carries r then g, second carries b then a.
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // 16-bit interleave of r/g and b/a, then 32-bit interleave of the
        // resulting rg and ba pairs to form whole rgba quads.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant using 256-bit unpacks.
    /// NOTE(review): 256-bit unpacks interleave per 128-bit lane, so the
    /// element order within each store differs from the strictly sequential
    /// 8-wide path — confirm consumers expect this layout.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
        simd16scalari src_ba = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc + sizeof(simd16scalari)));

        simdscalari src_r = _simd16_extract_si(src_rg, 0);
        simdscalari src_g = _simd16_extract_si(src_rg, 1);
        simdscalari src_b = _simd16_extract_si(src_ba, 0);
        simdscalari src_a = _simd16_extract_si(src_ba, 1);

        simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
        simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
        simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
        simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);

        simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
        simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
        simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
        simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
    }
#endif
};
604
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data:
    ///        runs of 16-bit r, g, b values become rgb? quads whose fourth
    ///        slot is filled from an undefined register (garbage by design —
    ///        the format has no alpha).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        __m128i src_a = _mm_undefined_si128();

        // Same interleave scheme as Transpose16_16_16_16, with a as filler.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant using 256-bit unpacks.
    /// NOTE(review): 256-bit unpacks interleave per 128-bit lane, so the
    /// element order within each store differs from the strictly sequential
    /// 8-wide path — confirm consumers expect this layout.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

        simdscalari src_r = _simd16_extract_si(src_rg, 0);
        simdscalari src_g = _simd16_extract_si(src_rg, 1);
        simdscalari src_b = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc + sizeof(simd16scalari)));
        simdscalari src_a = _mm256_undefined_si256();

        simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
        simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
        simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
        simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);

        simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
        simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
        simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
        simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
    }
#endif
};
670
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data:
    ///        a run of 16-bit r values followed by a run of 16-bit g values
    ///        becomes interleaved rg pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        // Low 128 bits hold r values, high 128 bits hold g values.
        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant using 256-bit unpacks.
    /// NOTE(review): 256-bit unpacks interleave per 128-bit lane, so the
    /// element order within the result differs from the strictly sequential
    /// 8-wide path — confirm consumers expect this layout.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari result = _simd16_setzero_si();

        simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

        simdscalari srclo = _simd16_extract_si(src, 0); // r values
        simdscalari srchi = _simd16_extract_si(src, 1); // g values

        result = _simd16_insert_si(result, _simd_unpacklo_epi16(srclo, srchi), 0);
        result = _simd16_insert_si(result, _simd_unpackhi_epi16(srclo, srchi), 1);

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), result);
    }
#endif
};
718
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    ///        Deliberately deleted: no implementation exists, so any use is
    ///        a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
734
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    ///        Deliberately deleted: no implementation exists, so any use is
    ///        a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
750
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Deliberately deleted: no implementation exists, so any use is
    ///        a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
766
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    ///        Deliberately deleted: no implementation exists, so any use is
    ///        a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
782
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempted use is rejected at compile time instead of at runtime.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16-wide variant - deleted for the same reason as Transpose above.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
798
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempted use is rejected at compile time instead of at runtime.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16-wide variant - deleted for the same reason as Transpose above.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
814
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    ///        (Brief previously said 5_5_5_1 - stale copy-paste from the
    ///        sibling struct above.)
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempted use is rejected at compile time instead of at runtime.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
826
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempted use is rejected at compile time instead of at runtime.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16-wide variant - deleted for the same reason as Transpose above.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
842
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempted use is rejected at compile time instead of at runtime.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16-wide variant - deleted for the same reason as Transpose above.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
858
// helper function to unroll loops
// Statically unrolls a loop: invokes func(i) for each i in [Begin, End),
// stepping by Step. (End - Begin) must be an exact multiple of Step so the
// recursion lands on the UnrollerL<End, End, Step> terminator below;
// otherwise instantiation recurses past End.
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};
868
// Recursion terminator: Begin has reached End, nothing left to invoke.
template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
875
876 // helper function to unroll loops, with mask to skip specific iterations
877 template<int Begin, int End, int Step = 1, int Mask = 0x7f>
878 struct UnrollerLMask {
879 template<typename Lambda>
880 INLINE static void step(Lambda& func) {
881 if(Mask & (1 << Begin))
882 {
883 func(Begin);
884 }
885 UnrollerL<Begin + Step, End, Step>::step(func);
886 }
887 };
888
// Recursion terminator: Begin has reached End, nothing left to invoke.
template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
895
// general CRC compute
//////////////////////////////////////////////////////////////////////////
/// @brief Accumulates a hardware CRC over an arbitrary byte buffer using
///        the SSE4.2 crc32 instruction (CRC-32C / Castagnoli polynomial).
/// @param crc - running CRC value to continue from (pass the seed to start
///              a fresh computation)
/// @param pData - pointer to the bytes to checksum
/// @param size - number of bytes to process
/// @return the updated CRC value
/// @note On 64-bit targets the bulk of the buffer is consumed 8 bytes at a
///       time (4 bytes on 32-bit builds); leftover tail bytes are folded in
///       one at a time.
/// @note NOTE(review): pData is dereferenced through uint64_t*/uint32_t*
///       with no alignment handling - assumes unaligned word loads are
///       acceptable on the target; confirm if this is ever ported off x86.
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    uint32_t sizeInQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    uint64_t* pDataWords = (uint64_t*)pData;
    for (uint32_t i = 0; i < sizeInQwords; ++i)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    uint32_t sizeInDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    uint32_t* pDataWords = (uint32_t*)pData;
    for (uint32_t i = 0; i < sizeInDwords; ++i)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}
926
927 //////////////////////////////////////////////////////////////////////////
928 /// Add byte offset to any-type pointer
929 //////////////////////////////////////////////////////////////////////////
930 template <typename T>
931 INLINE
932 static T* PtrAdd(T* p, intptr_t offset)
933 {
934 intptr_t intp = reinterpret_cast<intptr_t>(p);
935 return reinterpret_cast<T*>(intp + offset);
936 }
937
938 //////////////////////////////////////////////////////////////////////////
939 /// Is a power-of-2?
940 //////////////////////////////////////////////////////////////////////////
941 template <typename T>
942 INLINE
943 static bool IsPow2(T value)
944 {
945 return value == (value & (0 - value));
946 }
947
948 //////////////////////////////////////////////////////////////////////////
949 /// Align down to specified alignment
950 /// Note: IsPow2(alignment) MUST be true
951 //////////////////////////////////////////////////////////////////////////
952 template <typename T1, typename T2>
953 INLINE
954 static T1 AlignDownPow2(T1 value, T2 alignment)
955 {
956 SWR_ASSERT(IsPow2(alignment));
957 return value & ~T1(alignment - 1);
958 }
959
960 //////////////////////////////////////////////////////////////////////////
961 /// Align up to specified alignment
962 /// Note: IsPow2(alignment) MUST be true
963 //////////////////////////////////////////////////////////////////////////
964 template <typename T1, typename T2>
965 INLINE
966 static T1 AlignUpPow2(T1 value, T2 alignment)
967 {
968 return AlignDownPow2(value + T1(alignment - 1), alignment);
969 }
970
971 //////////////////////////////////////////////////////////////////////////
972 /// Align up ptr to specified alignment
973 /// Note: IsPow2(alignment) MUST be true
974 //////////////////////////////////////////////////////////////////////////
975 template <typename T1, typename T2>
976 INLINE
977 static T1* AlignUpPow2(T1* value, T2 alignment)
978 {
979 return reinterpret_cast<T1*>(
980 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
981 }
982
983 //////////////////////////////////////////////////////////////////////////
984 /// Align down to specified alignment
985 //////////////////////////////////////////////////////////////////////////
986 template <typename T1, typename T2>
987 INLINE
988 static T1 AlignDown(T1 value, T2 alignment)
989 {
990 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
991 return value - T1(value % alignment);
992 }
993
994 //////////////////////////////////////////////////////////////////////////
995 /// Align down to specified alignment
996 //////////////////////////////////////////////////////////////////////////
997 template <typename T1, typename T2>
998 INLINE
999 static T1* AlignDown(T1* value, T2 alignment)
1000 {
1001 return (T1*)AlignDown(uintptr_t(value), alignment);
1002 }
1003
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: alignment does NOT need to be a power of two - AlignDown falls
///       back to the generic modulo path for non-pow2 alignments. (The
///       earlier pow2 note was stale copy-paste from the *Pow2 variants.)
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUp(T1 value, T2 alignment)
{
    return AlignDown(value + T1(alignment - 1), alignment);
}
1014
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: alignment does NOT need to be a power of two - AlignDown falls
///       back to the generic modulo path for non-pow2 alignments. (The
///       earlier pow2 note was stale copy-paste from the *Pow2 variants.)
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUp(T1* value, T2 alignment)
{
    // Bias the pointer up by (alignment - 1) bytes, then round down.
    return AlignDown(PtrAdd(value, alignment - 1), alignment);
}
1025
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
//////////////////////////////////////////////////////////////////////////
/// @brief Packs ArrayLenT elements of BitsPerElementT bits each into an
///        array of machine words; elements are read back via operator[].
///        The backing words are zero-initialized.
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    // Elements must not straddle a word boundary, so the element size has
    // to divide the word size evenly.
    // (Fixed diagnostic typo: "must an" -> "must be an".)
    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};  // zero-initialized packed storage

public:

    /// @brief Extracts element elementIndex from the packed word array.
    /// @return the element value, masked to BitsPerElementT bits and
    ///         converted to T.
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
1053
// Ranged integer argument for TemplateArgUnroller
// Carries a runtime integer expected to lie in [TMin, TMax];
// TemplateArgUnroller peels the range one value at a time to map val onto
// a std::integral_constant template argument.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // runtime value; must satisfy TMin <= val <= TMax
};
1060
// Recursive template used to auto-nest conditionals. Converts dynamic boolean function
// arguments to static template arguments.
//
// TermT is a terminator class that provides:
//   - TermT::FuncType                      - the function-pointer type being selected
//   - TermT::template GetFunc<Args...>()   - returns the instantiation for a
//                                            fully-resolved static argument list
// Each runtime argument (bool, or ranged IntArg) is converted into a type
// (std::true_type / std::false_type, or std::integral_constant) appended to
// ArgsB; when no runtime arguments remain, TermT::GetFunc is invoked with
// the accumulated static argument pack.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    // Converts the final bool to true_type/false_type and dispatches to TermT.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    // Appends the bool's static type to ArgsB and recurses on the rest.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    // Scans from TMax down toward TMin until iArg.val matches, then appends
    // the value as an integral_constant. When TMin == TMax, overload
    // resolution prefers the more specialized IntArg<TVal, TVal> overload
    // below, which ends the scan.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        SWR_ASSUME(false); return nullptr;   // unreachable when val is within [TMin, TMax]
    }
    // Single-value range: val can only be TVal.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    // Same downward scan as above, but with further runtime args remaining.
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        SWR_ASSUME(false); return nullptr;   // unreachable when val is within [TMin, TMax]
    }
    // Single-value range with further runtime args remaining.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1139
1140