swr: [rasterizer core] portability - remove use of INT64
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include <type_traits>
32 #include <algorithm>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
36
37 #if defined(_WIN64) || defined(__x86_64__)
38 #define _MM_INSERT_EPI64 _mm_insert_epi64
39 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
40 #else
41 INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
42 {
43 OSALIGNLINE(uint32_t) elems[4];
44 _mm_store_si128((__m128i*)elems, a);
45 if (ndx == 0)
46 {
47 uint64_t foo = elems[0];
48 foo |= (uint64_t)elems[1] << 32;
49 return foo;
50 }
51 else
52 {
53 uint64_t foo = elems[2];
54 foo |= (uint64_t)elems[3] << 32;
55 return foo;
56 }
57 }
58
59 INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
60 {
61 OSALIGNLINE(int64_t) elems[2];
62 _mm_store_si128((__m128i*)elems, a);
63 if (ndx == 0)
64 {
65 elems[0] = b;
66 }
67 else
68 {
69 elems[1] = b;
70 }
71 __m128i out;
72 out = _mm_load_si128((const __m128i*)elems);
73 return out;
74 }
75 #endif
76
77 OSALIGNLINE(struct) BBOX
78 {
79 int top{ 0 };
80 int bottom{ 0 };
81 int left{ 0 };
82 int right{ 0 };
83
84 BBOX() {}
85 BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
86
87 bool operator==(const BBOX& rhs)
88 {
89 return (this->top == rhs.top &&
90 this->bottom == rhs.bottom &&
91 this->left == rhs.left &&
92 this->right == rhs.right);
93 }
94
95 bool operator!=(const BBOX& rhs)
96 {
97 return !(*this == rhs);
98 }
99
100 BBOX& Intersect(const BBOX& other)
101 {
102 this->top = std::max(this->top, other.top);
103 this->bottom = std::min(this->bottom, other.bottom);
104 this->left = std::max(this->left, other.left);
105 this->right = std::min(this->right, other.right);
106
107 if (right - left < 0 ||
108 bottom - top < 0)
109 {
110 // Zero area
111 top = bottom = left = right = 0;
112 }
113
114 return *this;
115 }
116 };
117
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD-width bounding box in SOA layout: each member holds the
/// corresponding edge for every SIMD lane (one box per lane).
struct simdBBox
{
    simdscalari top;
    simdscalari bottom;
    simdscalari left;
    simdscalari right;
};
125
126 INLINE
127 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
128 {
129 __m128i row0i = _mm_castps_si128(row0);
130 __m128i row1i = _mm_castps_si128(row1);
131 __m128i row2i = _mm_castps_si128(row2);
132 __m128i row3i = _mm_castps_si128(row3);
133
134 __m128i vTemp = row2i;
135 row2i = _mm_unpacklo_epi32(row2i, row3i);
136 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
137
138 row3i = row0i;
139 row0i = _mm_unpacklo_epi32(row0i, row1i);
140 row3i = _mm_unpackhi_epi32(row3i, row1i);
141
142 row1i = row0i;
143 row0i = _mm_unpacklo_epi64(row0i, row2i);
144 row1i = _mm_unpackhi_epi64(row1i, row2i);
145
146 row2i = row3i;
147 row2i = _mm_unpacklo_epi64(row2i, vTemp);
148 row3i = _mm_unpackhi_epi64(row3i, vTemp);
149
150 row0 = _mm_castsi128_ps(row0i);
151 row1 = _mm_castsi128_ps(row1i);
152 row2 = _mm_castsi128_ps(row2i);
153 row3 = _mm_castsi128_ps(row3i);
154 }
155
156 INLINE
157 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
158 {
159 __m128i vTemp = row2;
160 row2 = _mm_unpacklo_epi32(row2, row3);
161 vTemp = _mm_unpackhi_epi32(vTemp, row3);
162
163 row3 = row0;
164 row0 = _mm_unpacklo_epi32(row0, row1);
165 row3 = _mm_unpackhi_epi32(row3, row1);
166
167 row1 = row0;
168 row0 = _mm_unpacklo_epi64(row0, row2);
169 row1 = _mm_unpackhi_epi64(row1, row2);
170
171 row2 = row3;
172 row2 = _mm_unpacklo_epi64(row2, vTemp);
173 row3 = _mm_unpackhi_epi64(row3, vTemp);
174 }
175
// Collapse gcc's version component macros into one comparable number,
// e.g. gcc 4.9.0 -> 40900. (Harmless on non-gcc: only evaluated when
// defined(__GNUC__) holds in the #if below.)
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// clang and gcc < 4.9 may lack the _mm*_undefined_* intrinsics; fall back
// to zero-initialized vectors (same semantics for our uses, the contents
// are treated as don't-care).
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
187
188 #if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
189 INLINE
190 void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
191 {
192 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
193 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
194 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
195 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
196
197 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
198 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
199 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
200 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
201
202 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
203 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
204 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
205 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
206
207 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
208 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
209 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
210 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
211 }
212
213 INLINE
214 void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
215 {
216 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
217 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
218 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
219 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
220
221 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
222 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
223 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
224 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
225
226 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
227 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
228 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
229 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
230
231 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
232 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
233 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
234 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
235 }
236
237 INLINE
238 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
239 {
240 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
241 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
242 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
243 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
244 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
245 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
246 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
247 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
248 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
249 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
250 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
251 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
252 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
253 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
254 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
255 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
256 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
257 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
258 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
259 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
260 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
261 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
262 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
263 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
264 }
265
266 INLINE
267 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
268 {
269 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
270 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
271 }
272 #endif
273
274 //////////////////////////////////////////////////////////////////////////
275 /// TranposeSingleComponent
276 //////////////////////////////////////////////////////////////////////////
277 template<uint32_t bpp>
278 struct TransposeSingleComponent
279 {
280 //////////////////////////////////////////////////////////////////////////
281 /// @brief Pass-thru for single component.
282 /// @param pSrc - source data in SOA form
283 /// @param pDst - output data in AOS form
284 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
285 {
286 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
287 }
288 };
289
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data:
    ///        one SIMD-width run of each byte component in, interleaved
    ///        4-byte pixels out.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1 lacks a 256-bit integer byte shuffle, so split into 128-bit
        // halves and interleave with SSE unpacks.
        __m128i c0c1 = _mm256_castsi256_si128(src);                                          // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                       // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                       // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                         // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                         // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                      // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                      // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2: byte-shuffle each half into its final slots, then OR the
        // halves together. Mask bytes with the high bit set (0x80) produce
        // zero, leaving room for the other half's bytes.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#elif KNOB_SIMD_WIDTH == 16
        // 16-wide: same shuffle/permute/OR scheme via the _simd_* wrappers.
        simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);

        simdscalari dst01 = _simd_shuffle_epi8(src, mask0);

        simdscalari perm1 = _simd_permute_128(src, src, 1);

        simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);

        simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);

        simdscalari dst = _simd_or_si(dst01, dst23);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
};
343
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 8_8_8 data.
    /// Deleted: no transpose is implemented for this format, so any code
    /// path that instantiates a call to it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
355
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data:
    ///        two byte-component runs in, interleaved byte pairs out.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        // Both components fit in the low 128 bits: 8 r bytes then 8 g bytes.
        __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);   // gggggggg gggggggg
        // Interleave the byte components -> rgrgrg...
        rg = _mm_unpacklo_epi8(rg, g);
        _mm_store_si128((__m128i*)pDst, rg);
#elif KNOB_SIMD_WIDTH == 16
        __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg

        // Replicate each component's halves so the unpack below can
        // interleave within 128-bit lanes.
        __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

        __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx

        __m256i dst = _mm256_unpacklo_epi8(r, g);        // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

        _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
};
389
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data:
    ///        four SIMD-width 32-bit component streams in, one 16-byte xyzw
    ///        quad per lane out.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

        // 4 components x 8 lanes -> 8 AOS quads.
        __m128 vDst[8];
        vTranspose4x8(vDst, src0, src1, src2, src3);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst+4, vDst[1]);
        _mm_store_ps((float*)pDst+8, vDst[2]);
        _mm_store_ps((float*)pDst+12, vDst[3]);
        _mm_store_ps((float*)pDst+16, vDst[4]);
        _mm_store_ps((float*)pDst+20, vDst[5]);
        _mm_store_ps((float*)pDst+24, vDst[6]);
        _mm_store_ps((float*)pDst+28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
        simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48);

        __m128 vDst[8];

        // Emulated 16-wide vectors: transpose lanes 0-7 from the low
        // halves, then lanes 8-15 from the high halves.
        vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
        // NOTE(review): with KNOB_SIMD_WIDTH == 16 but ENABLE_AVX512_EMULATION
        // unset this compiles to an empty function - confirm a native path
        // exists elsewhere.
#else
#error Unsupported vector width
#endif
    }
};
453
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data:
    ///        three SIMD-width 32-bit component streams in, one 16-byte
    ///        quad per lane out (w slot undefined, see vTranspose3x8).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

        // 3 components x 8 lanes -> 8 AOS quads.
        __m128 vDst[8];
        vTranspose3x8(vDst, src0, src1, src2);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);

        __m128 vDst[8];

        // Emulated 16-wide vectors: low halves give lanes 0-7, high halves
        // give lanes 8-15.
        vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
        // NOTE(review): with KNOB_SIMD_WIDTH == 16 but ENABLE_AVX512_EMULATION
        // unset this compiles to an empty function - confirm intended.
#else
#error Unsupported vector width
#endif
    }
};
515
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32 data:
    ///        two SIMD-width 32-bit component streams in, interleaved
    ///        r/g pairs out.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        const float* pfSrc = (const float*)pSrc;
        // r occupies the first 8 floats, g the next 8; load each as halves.
        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
        __m128 src_g1 = _mm_load_ps(pfSrc + 12);

        // Interleave matching r/g halves -> r0g0r1g1 ...
        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm_store_ps(pfDst + 0, dst0);
        _mm_store_ps(pfDst + 4, dst1);
        _mm_store_ps(pfDst + 8, dst2);
        _mm_store_ps(pfDst + 12, dst3);
#elif KNOB_SIMD_WIDTH == 16
        const float* pfSrc = (const float*)pSrc;
        // Same scheme, 256-bit wide: r is the first 16 floats, g the next 16.
        __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
        __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
        __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
        __m256 src_g1 = _mm256_load_ps(pfSrc + 24);

        __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
        __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
        __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
        __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm256_store_ps(pfDst + 0, dst0);
        _mm256_store_ps(pfDst + 8, dst1);
        _mm256_store_ps(pfDst + 16, dst2);
        _mm256_store_ps(pfDst + 24, dst3);
#else
#error Unsupported vector width
#endif
    }
};
566
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data:
    ///        four SIMD-width 16-bit component streams in, interleaved
    ///        rgba quads out.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // First vector holds r|g, second holds b|a, one 128-bit half each.
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // Interleave 16-bit components pairwise, then 32-bit rg/ba pairs.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated 16-wide vectors: components live in the .lo/.hi halves.
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
        simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari)));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = src_ba.lo;
        __m256i src_a = src_ba.hi;

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
        // NOTE(review): empty function when ENABLE_AVX512_EMULATION is unset.
#else
#error Unsupported vector width
#endif
    }
};
631
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    ///        Like 16_16_16_16 but the alpha slot is filled from an
    ///        undefined vector, so callers must treat it as don't-care.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        __m128i src_a = _mm_undefined_si128(); // padding lane, contents unused

        // Interleave 16-bit components pairwise, then 32-bit rg/ba pairs.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari)));
        __m256i src_a = _mm256_undefined_si256(); // padding lane, contents unused

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
        // NOTE(review): empty function when ENABLE_AVX512_EMULATION is unset.
#else
#error Unsupported vector width
#endif
    }
};
694
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data:
    ///        two SIMD-width 16-bit component streams in, interleaved
    ///        pairs out.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        // Low 128 bits hold the first component, high 128 bits the second.
        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        // Interleave the 16-bit components.
        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));

        simdscalari result;

        // .lo holds the first component, .hi the second; interleave 16-bit
        // lanes within each 256-bit half.
        result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
        result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
#endif
        // NOTE(review): empty function when ENABLE_AVX512_EMULATION is unset.
#else
#error Unsupported vector width
#endif
    }
};
736
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data.
    /// Deleted: not implemented for this format; instantiating a call to it
    /// fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
748
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data.
    /// Deleted: not implemented for this format; instantiating a call to it
    /// fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
760
761
762
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data.
    /// Deleted: not implemented for this format; instantiating a call to it
    /// fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
774
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data.
    /// Deleted: not implemented for this format; instantiating a call to it
    /// fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
786
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data.
    /// Deleted: not implemented for this format; instantiating a call to it
    /// fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
798
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempted use fails at compile time rather than at runtime.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
810
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempted use fails at compile time rather than at runtime.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
822
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no transpose is implemented for this format, so any
    ///       attempted use fails at compile time rather than at runtime.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
834
835 // helper function to unroll loops
836 template<int Begin, int End, int Step = 1>
837 struct UnrollerL {
838 template<typename Lambda>
839 INLINE static void step(Lambda& func) {
840 func(Begin);
841 UnrollerL<Begin + Step, End, Step>::step(func);
842 }
843 };
844
845 template<int End, int Step>
846 struct UnrollerL<End, End, Step> {
847 template<typename Lambda>
848 static void step(Lambda& func) {
849 }
850 };
851
852 // helper function to unroll loops, with mask to skip specific iterations
853 template<int Begin, int End, int Step = 1, int Mask = 0x7f>
854 struct UnrollerLMask {
855 template<typename Lambda>
856 INLINE static void step(Lambda& func) {
857 if(Mask & (1 << Begin))
858 {
859 func(Begin);
860 }
861 UnrollerL<Begin + Step, End, Step>::step(func);
862 }
863 };
864
865 template<int End, int Step, int Mask>
866 struct UnrollerLMask<End, End, Step, Mask> {
867 template<typename Lambda>
868 static void step(Lambda& func) {
869 }
870 };
871
// general CRC compute
// Accumulates a hardware CRC (SSE4.2 crc32 instruction family) over an
// arbitrary byte buffer.
// @param crc   - running CRC from a previous call (seed for this chunk)
// @param pData - buffer to checksum
// @param size  - number of bytes in pData
// @return updated CRC value
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    // 64-bit targets: consume 8 bytes per crc32 instruction.
    uint32_t sizeInQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    uint64_t* pDataWords = (uint64_t*)pData;
    for (uint32_t i = 0; i < sizeInQwords; ++i)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    // 32-bit targets: _mm_crc32_u64 is unavailable, consume 4 bytes at a time.
    uint32_t sizeInDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    uint32_t* pDataWords = (uint32_t*)pData;
    for (uint32_t i = 0; i < sizeInDwords; ++i)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    // Fold in any trailing bytes that didn't fill a complete word.
    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}
902
903 //////////////////////////////////////////////////////////////////////////
904 /// Add byte offset to any-type pointer
905 //////////////////////////////////////////////////////////////////////////
906 template <typename T>
907 INLINE
908 static T* PtrAdd(T* p, intptr_t offset)
909 {
910 intptr_t intp = reinterpret_cast<intptr_t>(p);
911 return reinterpret_cast<T*>(intp + offset);
912 }
913
914 //////////////////////////////////////////////////////////////////////////
915 /// Is a power-of-2?
916 //////////////////////////////////////////////////////////////////////////
917 template <typename T>
918 INLINE
919 static bool IsPow2(T value)
920 {
921 return value == (value & (0 - value));
922 }
923
924 //////////////////////////////////////////////////////////////////////////
925 /// Align down to specified alignment
926 /// Note: IsPow2(alignment) MUST be true
927 //////////////////////////////////////////////////////////////////////////
928 template <typename T1, typename T2>
929 INLINE
930 static T1 AlignDownPow2(T1 value, T2 alignment)
931 {
932 SWR_ASSERT(IsPow2(alignment));
933 return value & ~T1(alignment - 1);
934 }
935
936 //////////////////////////////////////////////////////////////////////////
937 /// Align up to specified alignment
938 /// Note: IsPow2(alignment) MUST be true
939 //////////////////////////////////////////////////////////////////////////
940 template <typename T1, typename T2>
941 INLINE
942 static T1 AlignUpPow2(T1 value, T2 alignment)
943 {
944 return AlignDownPow2(value + T1(alignment - 1), alignment);
945 }
946
947 //////////////////////////////////////////////////////////////////////////
948 /// Align up ptr to specified alignment
949 /// Note: IsPow2(alignment) MUST be true
950 //////////////////////////////////////////////////////////////////////////
951 template <typename T1, typename T2>
952 INLINE
953 static T1* AlignUpPow2(T1* value, T2 alignment)
954 {
955 return reinterpret_cast<T1*>(
956 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
957 }
958
959 //////////////////////////////////////////////////////////////////////////
960 /// Align down to specified alignment
961 //////////////////////////////////////////////////////////////////////////
962 template <typename T1, typename T2>
963 INLINE
964 static T1 AlignDown(T1 value, T2 alignment)
965 {
966 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
967 return value - T1(value % alignment);
968 }
969
970 //////////////////////////////////////////////////////////////////////////
971 /// Align down to specified alignment
972 //////////////////////////////////////////////////////////////////////////
973 template <typename T1, typename T2>
974 INLINE
975 static T1* AlignDown(T1* value, T2 alignment)
976 {
977 return (T1*)AlignDown(uintptr_t(value), alignment);
978 }
979
980 //////////////////////////////////////////////////////////////////////////
981 /// Align up to specified alignment
982 /// Note: IsPow2(alignment) MUST be true
983 //////////////////////////////////////////////////////////////////////////
984 template <typename T1, typename T2>
985 INLINE
986 static T1 AlignUp(T1 value, T2 alignment)
987 {
988 return AlignDown(value + T1(alignment - 1), alignment);
989 }
990
991 //////////////////////////////////////////////////////////////////////////
992 /// Align up to specified alignment
993 /// Note: IsPow2(alignment) MUST be true
994 //////////////////////////////////////////////////////////////////////////
995 template <typename T1, typename T2>
996 INLINE
997 static T1* AlignUp(T1* value, T2 alignment)
998 {
999 return AlignDown(PtrAdd(value, alignment - 1), alignment);
1000 }
1001
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must an integral fraction of pointer size");

    // Elements are packed LSB-first into machine words, zero-initialized.
    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Reads element elementIndex by locating its word, shifting its
    ///        bits to the bottom, and masking off neighboring elements.
    T operator[] (size_t elementIndex) const
    {
        const size_t wordIdx = elementIndex / ELEMENTS_PER_WORD;
        const size_t bitOffset = (elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT;
        return T((m_words[wordIdx] >> bitOffset) & ELEMENT_MASK);
    }
};
1029
// Ranged integer argument for TemplateArgUnroller
// Wraps a runtime integer known to lie in [TMin, TMax]; the compile-time
// bounds drive the unrolled dispatch in TemplateArgUnroller::GetFunc.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // runtime value; expected to satisfy TMin <= val <= TMax
};
1036
// Recursive template used to auto-nest conditionals. Converts dynamic boolean function
// arguments to static template arguments.
//
// TermT is the terminal policy: it provides TermT::FuncType (the type
// returned) and TermT::template GetFunc<Args...>() (the lookup that maps the
// fully-resolved static argument list to a concrete function).  ArgsB...
// accumulates the static types resolved so far; each GetFunc overload
// consumes one dynamic argument and appends its static equivalent.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    // Maps the final runtime bool to std::true_type / std::false_type and
    // hands the completed static argument list to TermT.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    // Resolves one runtime bool to a static type, then recurses on the
    // remaining dynamic arguments.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    // Linear search from TMax down to TMin: if the runtime value equals TMax
    // emit integral_constant<TMax>, otherwise retry with the range shrunk by
    // one.  One branch is instantiated per value in the range.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        // Only reachable when iArg.val lies outside [TMin, TMax].
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range: selected by overload resolution (more specialized)
    // when the search above narrows the range to TMin == TMax.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    // Same linear search as the terminator above, but forwards remainingArgs
    // once the integer is resolved.
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        // Only reachable when iArg.val lies outside [TMin, TMax].
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range with trailing args; see the terminator counterpart.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1115
1116