1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @brief Utilities used by SWR core.
27 ******************************************************************************/
#include <algorithm>
#include <type_traits>

#include "common/os.h"
#include "common/simdintrin.h"
#include "common/swr_assert.h"
#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
//////////////////////////////////////////////////////////////////////////
/// @brief 32-bit fallback for _mm_extract_epi64: spill the vector to
///        aligned memory and reassemble the selected qword from two dwords.
/// @param a - source vector
/// @param ndx - qword index to extract (0 or 1)
INLINE
int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    OSALIGNLINE(uint32_t) elems[4];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        uint64_t foo = elems[0];
        foo |= (uint64_t)elems[1] << 32;
        return foo;
    }
    else
    {
        uint64_t foo = elems[2];
        foo |= (uint64_t)elems[3] << 32;
        return foo;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief 32-bit fallback for _mm_insert_epi64: spill the vector to
///        aligned memory, patch the selected qword, and reload.
/// @param a - source vector
/// @param b - qword value to insert
/// @param ndx - qword index to replace (0 or 1)
INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    OSALIGNLINE(int64_t) elems[2];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        elems[0] = b;
    }
    else
    {
        elems[1] = b;
    }
    __m128i out;
    out = _mm_load_si128((const __m128i*)elems);
    return out;
}
#endif
77 OSALIGNLINE(struct) BBOX
85 BBOX(int t
, int b
, int l
, int r
) : top(t
), bottom(b
), left(l
), right(r
) {}
87 bool operator==(const BBOX
& rhs
)
89 return (this->top
== rhs
.top
&&
90 this->bottom
== rhs
.bottom
&&
91 this->left
== rhs
.left
&&
92 this->right
== rhs
.right
);
95 bool operator!=(const BBOX
& rhs
)
97 return !(*this == rhs
);
100 BBOX
& Intersect(const BBOX
& other
)
102 this->top
= std::max(this->top
, other
.top
);
103 this->bottom
= std::min(this->bottom
, other
.bottom
);
104 this->left
= std::max(this->left
, other
.left
);
105 this->right
= std::min(this->right
, other
.right
);
107 if (right
- left
< 0 ||
111 top
= bottom
= left
= right
= 0;
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes a 4x4 matrix of floats held in four SSE registers,
///        in place. Row n comes back holding original column n.
inline
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    // Work in the integer domain; unpack_epi32/64 express the shuffles directly.
    __m128i row0i = _mm_castps_si128(row0);
    __m128i row1i = _mm_castps_si128(row1);
    __m128i row2i = _mm_castps_si128(row2);
    __m128i row3i = _mm_castps_si128(row3);

    // Interleave rows 2/3: low and high dword pairs.
    __m128i vTemp = row2i;
    row2i = _mm_unpacklo_epi32(row2i, row3i);   // x2y2 lo halves -> (r2[0],r3[0],r2[1],r3[1])
    vTemp = _mm_unpackhi_epi32(vTemp, row3i);   // (r2[2],r3[2],r2[3],r3[3])

    // Interleave rows 0/1; row3i temporarily holds the old row0.
    row3i = row0i;
    row0i = _mm_unpacklo_epi32(row0i, row1i);   // (r0[0],r1[0],r0[1],r1[1])
    row3i = _mm_unpackhi_epi32(row3i, row1i);   // (r0[2],r1[2],r0[3],r1[3])

    // Combine qword halves to finish the transpose.
    row1i = row0i;
    row0i = _mm_unpacklo_epi64(row0i, row2i);   // column 0
    row1i = _mm_unpackhi_epi64(row1i, row2i);   // column 1

    row2i = row3i;
    row2i = _mm_unpacklo_epi64(row2i, vTemp);   // column 2
    row3i = _mm_unpackhi_epi64(row3i, vTemp);   // column 3

    row0 = _mm_castsi128_ps(row0i);
    row1 = _mm_castsi128_ps(row1i);
    row2 = _mm_castsi128_ps(row2i);
    row3 = _mm_castsi128_ps(row3i);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes a 4x4 matrix of dwords held in four SSE integer
///        registers, in place. Same shuffle network as the float overload.
inline
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    // Interleave rows 2/3.
    __m128i vTemp = row2;
    row2 = _mm_unpacklo_epi32(row2, row3);
    vTemp = _mm_unpackhi_epi32(vTemp, row3);

    // Interleave rows 0/1; row3 temporarily holds the old row0.
    row3 = row0;
    row0 = _mm_unpacklo_epi32(row0, row1);
    row3 = _mm_unpackhi_epi32(row3, row1);

    // Combine qword halves to form the output columns.
    row1 = row0;
    row0 = _mm_unpacklo_epi64(row0, row2);
    row1 = _mm_unpackhi_epi64(row1, row2);

    row2 = row3;
    row2 = _mm_unpacklo_epi64(row2, vTemp);
    row3 = _mm_unpackhi_epi64(row3, vTemp);
}
#define GCC_VERSION (__GNUC__ * 10000 \
                   + __GNUC_MINOR__ * 100 \
                   + __GNUC_PATCHLEVEL__)

// Clang and GCC before 4.9 lack the _mm*_undefined_* intrinsics;
// fall back to explicit zeroing (safe, marginally slower).
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
188 #if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
190 void vTranspose3x8(__m128 (&vDst
)[8], __m256
&vSrc0
, __m256
&vSrc1
, __m256
&vSrc2
)
192 __m256 r0r2
= _mm256_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
193 __m256 r1rx
= _mm256_unpacklo_ps(vSrc1
, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
194 __m256 r02r1xlolo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
195 __m256 r02r1xlohi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
197 r0r2
= _mm256_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
198 r1rx
= _mm256_unpackhi_ps(vSrc1
, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
199 __m256 r02r1xhilo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
200 __m256 r02r1xhihi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
202 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
203 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
204 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
205 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
207 vDst
[4] = _mm256_extractf128_ps(r02r1xlolo
, 1);
208 vDst
[5] = _mm256_extractf128_ps(r02r1xlohi
, 1);
209 vDst
[6] = _mm256_extractf128_ps(r02r1xhilo
, 1);
210 vDst
[7] = _mm256_extractf128_ps(r02r1xhihi
, 1);
214 void vTranspose4x8(__m128 (&vDst
)[8], __m256
&vSrc0
, __m256
&vSrc1
, __m256
&vSrc2
, __m256
&vSrc3
)
216 __m256 r0r2
= _mm256_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
217 __m256 r1rx
= _mm256_unpacklo_ps(vSrc1
, vSrc3
); //y0w0y1w1 y4w4y5w5
218 __m256 r02r1xlolo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
219 __m256 r02r1xlohi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
221 r0r2
= _mm256_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
222 r1rx
= _mm256_unpackhi_ps(vSrc1
, vSrc3
) ; //y2w2y3w3 y6w6yw77
223 __m256 r02r1xhilo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
224 __m256 r02r1xhihi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
226 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
227 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
228 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
229 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
231 vDst
[4] = _mm256_extractf128_ps(r02r1xlolo
, 1);
232 vDst
[5] = _mm256_extractf128_ps(r02r1xlohi
, 1);
233 vDst
[6] = _mm256_extractf128_ps(r02r1xhilo
, 1);
234 vDst
[7] = _mm256_extractf128_ps(r02r1xhihi
, 1);
238 void vTranspose8x8(__m256 (&vDst
)[8], const __m256
&vMask0
, const __m256
&vMask1
, const __m256
&vMask2
, const __m256
&vMask3
, const __m256
&vMask4
, const __m256
&vMask5
, const __m256
&vMask6
, const __m256
&vMask7
)
240 __m256 __t0
= _mm256_unpacklo_ps(vMask0
, vMask1
);
241 __m256 __t1
= _mm256_unpackhi_ps(vMask0
, vMask1
);
242 __m256 __t2
= _mm256_unpacklo_ps(vMask2
, vMask3
);
243 __m256 __t3
= _mm256_unpackhi_ps(vMask2
, vMask3
);
244 __m256 __t4
= _mm256_unpacklo_ps(vMask4
, vMask5
);
245 __m256 __t5
= _mm256_unpackhi_ps(vMask4
, vMask5
);
246 __m256 __t6
= _mm256_unpacklo_ps(vMask6
, vMask7
);
247 __m256 __t7
= _mm256_unpackhi_ps(vMask6
, vMask7
);
248 __m256 __tt0
= _mm256_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(1,0,1,0));
249 __m256 __tt1
= _mm256_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(3,2,3,2));
250 __m256 __tt2
= _mm256_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(1,0,1,0));
251 __m256 __tt3
= _mm256_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(3,2,3,2));
252 __m256 __tt4
= _mm256_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(1,0,1,0));
253 __m256 __tt5
= _mm256_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(3,2,3,2));
254 __m256 __tt6
= _mm256_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(1,0,1,0));
255 __m256 __tt7
= _mm256_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(3,2,3,2));
256 vDst
[0] = _mm256_permute2f128_ps(__tt0
, __tt4
, 0x20);
257 vDst
[1] = _mm256_permute2f128_ps(__tt1
, __tt5
, 0x20);
258 vDst
[2] = _mm256_permute2f128_ps(__tt2
, __tt6
, 0x20);
259 vDst
[3] = _mm256_permute2f128_ps(__tt3
, __tt7
, 0x20);
260 vDst
[4] = _mm256_permute2f128_ps(__tt0
, __tt4
, 0x31);
261 vDst
[5] = _mm256_permute2f128_ps(__tt1
, __tt5
, 0x31);
262 vDst
[6] = _mm256_permute2f128_ps(__tt2
, __tt6
, 0x31);
263 vDst
[7] = _mm256_permute2f128_ps(__tt3
, __tt7
, 0x31);
267 void vTranspose8x8(__m256 (&vDst
)[8], const __m256i
&vMask0
, const __m256i
&vMask1
, const __m256i
&vMask2
, const __m256i
&vMask3
, const __m256i
&vMask4
, const __m256i
&vMask5
, const __m256i
&vMask6
, const __m256i
&vMask7
)
269 vTranspose8x8(vDst
, _mm256_castsi256_ps(vMask0
), _mm256_castsi256_ps(vMask1
), _mm256_castsi256_ps(vMask2
), _mm256_castsi256_ps(vMask3
),
270 _mm256_castsi256_ps(vMask4
), _mm256_castsi256_ps(vMask5
), _mm256_castsi256_ps(vMask6
), _mm256_castsi256_ps(vMask7
));
274 //////////////////////////////////////////////////////////////////////////
275 /// TranposeSingleComponent
276 //////////////////////////////////////////////////////////////////////////
277 template<uint32_t bpp
>
278 struct TransposeSingleComponent
280 //////////////////////////////////////////////////////////////////////////
281 /// @brief Pass-thru for single component.
282 /// @param pSrc - source data in SOA form
283 /// @param pDst - output data in AOS form
284 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
286 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
290 //////////////////////////////////////////////////////////////////////////
292 //////////////////////////////////////////////////////////////////////////
293 struct Transpose8_8_8_8
295 //////////////////////////////////////////////////////////////////////////
296 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
297 /// @param pSrc - source data in SOA form
298 /// @param pDst - output data in AOS form
299 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
301 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
303 #if KNOB_SIMD_WIDTH == 8
304 #if KNOB_ARCH == KNOB_ARCH_AVX
305 __m128i c0c1
= _mm256_castsi256_si128(src
); // rrrrrrrrgggggggg
306 __m128i c2c3
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src
), 1)); // bbbbbbbbaaaaaaaa
307 __m128i c0c2
= _mm_unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
308 __m128i c1c3
= _mm_unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
309 __m128i c01
= _mm_unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
310 __m128i c23
= _mm_unpackhi_epi8(c0c2
, c1c3
); // babababababababa
311 __m128i c0123lo
= _mm_unpacklo_epi16(c01
, c23
); // rgbargbargbargba
312 __m128i c0123hi
= _mm_unpackhi_epi16(c01
, c23
); // rgbargbargbargba
313 _mm_store_si128((__m128i
*)pDst
, c0123lo
);
314 _mm_store_si128((__m128i
*)(pDst
+ 16), c0123hi
);
315 #elif KNOB_ARCH == KNOB_ARCH_AVX2
316 simdscalari dst01
= _mm256_shuffle_epi8(src
,
317 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
318 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
319 dst23
= _mm256_shuffle_epi8(dst23
,
320 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
321 simdscalari dst
= _mm256_or_si256(dst01
, dst23
);
322 _simd_store_si((simdscalari
*)pDst
, dst
);
324 #elif KNOB_SIMD_WIDTH == 16
325 simdscalari mask0
= _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
327 simdscalari dst01
= _simd_shuffle_epi8(src
, mask0
);
329 simdscalari perm1
= _simd_permute_128(src
, src
, 1);
331 simdscalari mask1
= _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
333 simdscalari dst23
= _simd_shuffle_epi8(perm1
, mask1
);
335 simdscalari dst
= _simd_or_si(dst01
, dst23
);
337 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), dst
);
339 #error Unsupported vector width
344 //////////////////////////////////////////////////////////////////////////
346 //////////////////////////////////////////////////////////////////////////
347 struct Transpose8_8_8
349 //////////////////////////////////////////////////////////////////////////
350 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
351 /// @param pSrc - source data in SOA form
352 /// @param pDst - output data in AOS form
353 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
356 //////////////////////////////////////////////////////////////////////////
358 //////////////////////////////////////////////////////////////////////////
361 //////////////////////////////////////////////////////////////////////////
362 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
363 /// @param pSrc - source data in SOA form
364 /// @param pDst - output data in AOS form
365 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
367 #if KNOB_SIMD_WIDTH == 8
368 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
370 __m128i rg
= _mm256_castsi256_si128(src
); // rrrrrrrr gggggggg
371 __m128i g
= _mm_unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
372 rg
= _mm_unpacklo_epi8(rg
, g
);
373 _mm_store_si128((__m128i
*)pDst
, rg
);
374 #elif KNOB_SIMD_WIDTH == 16
375 __m256i src
= _mm256_load_si256(reinterpret_cast<const __m256i
*>(pSrc
)); // rrrrrrrrrrrrrrrrgggggggggggggggg
377 __m256i r
= _mm256_permute4x64_epi64(src
, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
379 __m256i g
= _mm256_permute4x64_epi64(src
, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx
381 __m256i dst
= _mm256_unpacklo_epi8(r
, g
); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
383 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
), dst
);
385 #error Unsupported vector width
390 //////////////////////////////////////////////////////////////////////////
391 /// Transpose32_32_32_32
392 //////////////////////////////////////////////////////////////////////////
393 struct Transpose32_32_32_32
395 //////////////////////////////////////////////////////////////////////////
396 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
397 /// @param pSrc - source data in SOA form
398 /// @param pDst - output data in AOS form
399 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
401 #if KNOB_SIMD_WIDTH == 8
402 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
403 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
404 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
405 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
408 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
409 _mm_store_ps((float*)pDst
, vDst
[0]);
410 _mm_store_ps((float*)pDst
+4, vDst
[1]);
411 _mm_store_ps((float*)pDst
+8, vDst
[2]);
412 _mm_store_ps((float*)pDst
+12, vDst
[3]);
413 _mm_store_ps((float*)pDst
+16, vDst
[4]);
414 _mm_store_ps((float*)pDst
+20, vDst
[5]);
415 _mm_store_ps((float*)pDst
+24, vDst
[6]);
416 _mm_store_ps((float*)pDst
+28, vDst
[7]);
417 #elif KNOB_SIMD_WIDTH == 16
418 #if ENABLE_AVX512_EMULATION
419 simdscalar src0
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
));
420 simdscalar src1
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 16);
421 simdscalar src2
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 32);
422 simdscalar src3
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 48);
426 vTranspose4x8(vDst
, src0
.lo
, src1
.lo
, src2
.lo
, src3
.lo
);
428 _mm_store_ps(reinterpret_cast<float*>(pDst
), vDst
[0]);
429 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 4, vDst
[1]);
430 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 8, vDst
[2]);
431 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 12, vDst
[3]);
432 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 16, vDst
[4]);
433 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 20, vDst
[5]);
434 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 24, vDst
[6]);
435 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 28, vDst
[7]);
437 vTranspose4x8(vDst
, src0
.hi
, src1
.hi
, src2
.hi
, src3
.hi
);
439 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 32, vDst
[0]);
440 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 36, vDst
[1]);
441 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 40, vDst
[2]);
442 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 44, vDst
[3]);
443 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 48, vDst
[4]);
444 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 52, vDst
[5]);
445 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 56, vDst
[6]);
446 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 60, vDst
[7]);
449 #error Unsupported vector width
454 //////////////////////////////////////////////////////////////////////////
455 /// Transpose32_32_32
456 //////////////////////////////////////////////////////////////////////////
457 struct Transpose32_32_32
459 //////////////////////////////////////////////////////////////////////////
460 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
461 /// @param pSrc - source data in SOA form
462 /// @param pDst - output data in AOS form
463 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
465 #if KNOB_SIMD_WIDTH == 8
466 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
467 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
468 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
471 vTranspose3x8(vDst
, src0
, src1
, src2
);
472 _mm_store_ps((float*)pDst
, vDst
[0]);
473 _mm_store_ps((float*)pDst
+ 4, vDst
[1]);
474 _mm_store_ps((float*)pDst
+ 8, vDst
[2]);
475 _mm_store_ps((float*)pDst
+ 12, vDst
[3]);
476 _mm_store_ps((float*)pDst
+ 16, vDst
[4]);
477 _mm_store_ps((float*)pDst
+ 20, vDst
[5]);
478 _mm_store_ps((float*)pDst
+ 24, vDst
[6]);
479 _mm_store_ps((float*)pDst
+ 28, vDst
[7]);
480 #elif KNOB_SIMD_WIDTH == 16
481 #if ENABLE_AVX512_EMULATION
482 simdscalar src0
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
));
483 simdscalar src1
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 16);
484 simdscalar src2
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 32);
488 vTranspose3x8(vDst
, src0
.lo
, src1
.lo
, src2
.lo
);
490 _mm_store_ps(reinterpret_cast<float*>(pDst
), vDst
[0]);
491 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 4, vDst
[1]);
492 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 8, vDst
[2]);
493 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 12, vDst
[3]);
494 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 16, vDst
[4]);
495 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 20, vDst
[5]);
496 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 24, vDst
[6]);
497 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 28, vDst
[7]);
499 vTranspose3x8(vDst
, src0
.hi
, src1
.hi
, src2
.hi
);
501 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 32, vDst
[0]);
502 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 36, vDst
[1]);
503 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 40, vDst
[2]);
504 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 44, vDst
[3]);
505 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 48, vDst
[4]);
506 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 52, vDst
[5]);
507 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 56, vDst
[6]);
508 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 60, vDst
[7]);
511 #error Unsupported vector width
516 //////////////////////////////////////////////////////////////////////////
518 //////////////////////////////////////////////////////////////////////////
519 struct Transpose32_32
521 //////////////////////////////////////////////////////////////////////////
522 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
523 /// @param pSrc - source data in SOA form
524 /// @param pDst - output data in AOS form
525 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
527 #if KNOB_SIMD_WIDTH == 8
528 const float* pfSrc
= (const float*)pSrc
;
529 __m128 src_r0
= _mm_load_ps(pfSrc
+ 0);
530 __m128 src_r1
= _mm_load_ps(pfSrc
+ 4);
531 __m128 src_g0
= _mm_load_ps(pfSrc
+ 8);
532 __m128 src_g1
= _mm_load_ps(pfSrc
+ 12);
534 __m128 dst0
= _mm_unpacklo_ps(src_r0
, src_g0
);
535 __m128 dst1
= _mm_unpackhi_ps(src_r0
, src_g0
);
536 __m128 dst2
= _mm_unpacklo_ps(src_r1
, src_g1
);
537 __m128 dst3
= _mm_unpackhi_ps(src_r1
, src_g1
);
539 float* pfDst
= (float*)pDst
;
540 _mm_store_ps(pfDst
+ 0, dst0
);
541 _mm_store_ps(pfDst
+ 4, dst1
);
542 _mm_store_ps(pfDst
+ 8, dst2
);
543 _mm_store_ps(pfDst
+ 12, dst3
);
544 #elif KNOB_SIMD_WIDTH == 16
545 const float* pfSrc
= (const float*)pSrc
;
546 __m256 src_r0
= _mm256_load_ps(pfSrc
+ 0);
547 __m256 src_r1
= _mm256_load_ps(pfSrc
+ 8);
548 __m256 src_g0
= _mm256_load_ps(pfSrc
+ 16);
549 __m256 src_g1
= _mm256_load_ps(pfSrc
+ 24);
551 __m256 dst0
= _mm256_unpacklo_ps(src_r0
, src_g0
);
552 __m256 dst1
= _mm256_unpackhi_ps(src_r0
, src_g0
);
553 __m256 dst2
= _mm256_unpacklo_ps(src_r1
, src_g1
);
554 __m256 dst3
= _mm256_unpackhi_ps(src_r1
, src_g1
);
556 float* pfDst
= (float*)pDst
;
557 _mm256_store_ps(pfDst
+ 0, dst0
);
558 _mm256_store_ps(pfDst
+ 8, dst1
);
559 _mm256_store_ps(pfDst
+ 16, dst2
);
560 _mm256_store_ps(pfDst
+ 24, dst3
);
562 #error Unsupported vector width
567 //////////////////////////////////////////////////////////////////////////
568 /// Transpose16_16_16_16
569 //////////////////////////////////////////////////////////////////////////
570 struct Transpose16_16_16_16
572 //////////////////////////////////////////////////////////////////////////
573 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
574 /// @param pSrc - source data in SOA form
575 /// @param pDst - output data in AOS form
576 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
578 #if KNOB_SIMD_WIDTH == 8
579 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
580 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
582 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
583 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
584 __m128i src_b
= _mm256_extractf128_si256(src_ba
, 0);
585 __m128i src_a
= _mm256_extractf128_si256(src_ba
, 1);
587 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
588 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
589 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
590 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
592 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
593 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
594 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
595 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
597 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
598 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
599 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
600 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
601 #elif KNOB_SIMD_WIDTH == 16
602 #if ENABLE_AVX512_EMULATION
603 simdscalari src_rg
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
));
604 simdscalari src_ba
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
+ sizeof(simdscalari
)));
606 __m256i src_r
= src_rg
.lo
;
607 __m256i src_g
= src_rg
.hi
;
608 __m256i src_b
= src_ba
.lo
;
609 __m256i src_a
= src_ba
.hi
;
611 __m256i rg0
= _mm256_unpacklo_epi16(src_r
, src_g
);
612 __m256i rg1
= _mm256_unpackhi_epi16(src_r
, src_g
);
613 __m256i ba0
= _mm256_unpacklo_epi16(src_b
, src_a
);
614 __m256i ba1
= _mm256_unpackhi_epi16(src_b
, src_a
);
616 __m256i dst0
= _mm256_unpacklo_epi32(rg0
, ba0
);
617 __m256i dst1
= _mm256_unpackhi_epi32(rg0
, ba0
);
618 __m256i dst2
= _mm256_unpacklo_epi32(rg1
, ba1
);
619 __m256i dst3
= _mm256_unpackhi_epi32(rg1
, ba1
);
621 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 0, dst0
);
622 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 1, dst1
);
623 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 2, dst2
);
624 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 3, dst3
);
627 #error Unsupported vector width
632 //////////////////////////////////////////////////////////////////////////
633 /// Transpose16_16_16
634 //////////////////////////////////////////////////////////////////////////
635 struct Transpose16_16_16
637 //////////////////////////////////////////////////////////////////////////
638 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
639 /// @param pSrc - source data in SOA form
640 /// @param pDst - output data in AOS form
641 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
643 #if KNOB_SIMD_WIDTH == 8
644 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
646 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
647 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
648 __m128i src_b
= _mm_load_si128((const __m128i
*)(pSrc
+ sizeof(simdscalari
)));
649 __m128i src_a
= _mm_undefined_si128();
651 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
652 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
653 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
654 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
656 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
657 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
658 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
659 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
661 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
662 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
663 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
664 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
665 #elif KNOB_SIMD_WIDTH == 16
666 #if ENABLE_AVX512_EMULATION
667 simdscalari src_rg
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
));
669 __m256i src_r
= src_rg
.lo
;
670 __m256i src_g
= src_rg
.hi
;
671 __m256i src_b
= _mm256_load_si256(reinterpret_cast<const __m256i
*>(pSrc
+ sizeof(simdscalari
)));
672 __m256i src_a
= _mm256_undefined_si256();
674 __m256i rg0
= _mm256_unpacklo_epi16(src_r
, src_g
);
675 __m256i rg1
= _mm256_unpackhi_epi16(src_r
, src_g
);
676 __m256i ba0
= _mm256_unpacklo_epi16(src_b
, src_a
);
677 __m256i ba1
= _mm256_unpackhi_epi16(src_b
, src_a
);
679 __m256i dst0
= _mm256_unpacklo_epi32(rg0
, ba0
);
680 __m256i dst1
= _mm256_unpackhi_epi32(rg0
, ba0
);
681 __m256i dst2
= _mm256_unpacklo_epi32(rg1
, ba1
);
682 __m256i dst3
= _mm256_unpackhi_epi32(rg1
, ba1
);
684 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 0, dst0
);
685 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 1, dst1
);
686 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 2, dst2
);
687 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 3, dst3
);
690 #error Unsupported vector width
695 //////////////////////////////////////////////////////////////////////////
697 //////////////////////////////////////////////////////////////////////////
698 struct Transpose16_16
700 //////////////////////////////////////////////////////////////////////////
701 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
702 /// @param pSrc - source data in SOA form
703 /// @param pDst - output data in AOS form
704 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
706 #if KNOB_SIMD_WIDTH == 8
707 simdscalar src
= _simd_load_ps((const float*)pSrc
);
709 __m128 comp0
= _mm256_castps256_ps128(src
);
710 __m128 comp1
= _mm256_extractf128_ps(src
, 1);
712 __m128i comp0i
= _mm_castps_si128(comp0
);
713 __m128i comp1i
= _mm_castps_si128(comp1
);
715 __m128i resLo
= _mm_unpacklo_epi16(comp0i
, comp1i
);
716 __m128i resHi
= _mm_unpackhi_epi16(comp0i
, comp1i
);
718 _mm_store_si128((__m128i
*)pDst
, resLo
);
719 _mm_store_si128((__m128i
*)pDst
+ 1, resHi
);
720 #elif KNOB_SIMD_WIDTH == 16
721 #if ENABLE_AVX512_EMULATION
722 simdscalari src
= _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc
)));
726 result
.lo
= _mm256_unpacklo_epi16(src
.lo
, src
.hi
);
727 result
.hi
= _mm256_unpackhi_epi16(src
.lo
, src
.hi
);
729 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), result
);
732 #error Unsupported vector width
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    ///        Deleted: no consumer requires this format yet.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    ///        Deleted: no consumer requires this format yet.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Deleted: no consumer requires this format yet.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    ///        Deleted: no consumer requires this format yet.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Deleted: no consumer requires this format yet.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Deleted: no consumer requires this format yet.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Deleted: no consumer requires this format yet.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    ///        Deleted: no consumer requires this format yet.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
// helper function to unroll loops
// Compile-time loop unroller: UnrollerL<Begin, End, Step>::step(f) expands
// to f(Begin); f(Begin + Step); ... for every index in [Begin, End) — End
// itself is excluded; the matching UnrollerL<End, End, Step> partial
// specialization terminates the recursion.
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        // Invoke the functor with the current compile-time index, then
        // recurse with the index advanced by Step.
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};
// Partial specialization that terminates the UnrollerL recursion when the
// running index reaches End: step() is intentionally empty.
template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
852 // helper function to unroll loops, with mask to skip specific iterations
853 template<int Begin
, int End
, int Step
= 1, int Mask
= 0x7f>
854 struct UnrollerLMask
{
855 template<typename Lambda
>
856 INLINE
static void step(Lambda
& func
) {
857 if(Mask
& (1 << Begin
))
861 UnrollerL
<Begin
+ Step
, End
, Step
>::step(func
);
// Partial specialization that terminates the UnrollerLMask recursion when
// the running index reaches End: step() is intentionally empty.
template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
// general CRC compute
// Computes a hardware-accelerated CRC over an arbitrary byte buffer using
// the SSE4.2 CRC32 instruction (_mm_crc32_*), consuming the data in
// machine-word chunks and finishing with the leftover tail bytes.
/// @param crc   - running CRC value to fold the new data into (seed).
/// @param pData - pointer to the bytes to checksum.
/// @param size  - number of bytes at pData.
/// @return the updated CRC value.
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    // 64-bit targets: fold 8 bytes per CRC32 instruction.
    uint32_t sizeInQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    uint64_t* pDataWords = (uint64_t*)pData;
    for (uint32_t i = 0; i < sizeInQwords; ++i)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    // 32-bit targets: fold 4 bytes per CRC32 instruction.
    uint32_t sizeInDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    uint32_t* pDataWords = (uint32_t*)pData;
    for (uint32_t i = 0; i < sizeInDwords; ++i)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    // Fold in the bytes (0..wordsize-1 of them) left over after the
    // word-sized passes; pDataWords already points just past the last
    // full word consumed above.
    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}
903 //////////////////////////////////////////////////////////////////////////
904 /// Add byte offset to any-type pointer
905 //////////////////////////////////////////////////////////////////////////
906 template <typename T
>
908 static T
* PtrAdd(T
* p
, intptr_t offset
)
910 intptr_t intp
= reinterpret_cast<intptr_t>(p
);
911 return reinterpret_cast<T
*>(intp
+ offset
);
914 //////////////////////////////////////////////////////////////////////////
916 //////////////////////////////////////////////////////////////////////////
917 template <typename T
>
919 static bool IsPow2(T value
)
921 return value
== (value
& (0 - value
));
924 //////////////////////////////////////////////////////////////////////////
925 /// Align down to specified alignment
926 /// Note: IsPow2(alignment) MUST be true
927 //////////////////////////////////////////////////////////////////////////
928 template <typename T1
, typename T2
>
930 static T1
AlignDownPow2(T1 value
, T2 alignment
)
932 SWR_ASSERT(IsPow2(alignment
));
933 return value
& ~T1(alignment
- 1);
936 //////////////////////////////////////////////////////////////////////////
937 /// Align up to specified alignment
938 /// Note: IsPow2(alignment) MUST be true
939 //////////////////////////////////////////////////////////////////////////
940 template <typename T1
, typename T2
>
942 static T1
AlignUpPow2(T1 value
, T2 alignment
)
944 return AlignDownPow2(value
+ T1(alignment
- 1), alignment
);
947 //////////////////////////////////////////////////////////////////////////
948 /// Align up ptr to specified alignment
949 /// Note: IsPow2(alignment) MUST be true
950 //////////////////////////////////////////////////////////////////////////
951 template <typename T1
, typename T2
>
953 static T1
* AlignUpPow2(T1
* value
, T2 alignment
)
955 return reinterpret_cast<T1
*>(
956 AlignDownPow2(reinterpret_cast<uintptr_t>(value
) + uintptr_t(alignment
- 1), alignment
));
959 //////////////////////////////////////////////////////////////////////////
960 /// Align down to specified alignment
961 //////////////////////////////////////////////////////////////////////////
962 template <typename T1
, typename T2
>
964 static T1
AlignDown(T1 value
, T2 alignment
)
966 if (IsPow2(alignment
)) { return AlignDownPow2(value
, alignment
); }
967 return value
- T1(value
% alignment
);
970 //////////////////////////////////////////////////////////////////////////
971 /// Align down to specified alignment
972 //////////////////////////////////////////////////////////////////////////
973 template <typename T1
, typename T2
>
975 static T1
* AlignDown(T1
* value
, T2 alignment
)
977 return (T1
*)AlignDown(uintptr_t(value
), alignment
);
980 //////////////////////////////////////////////////////////////////////////
981 /// Align up to specified alignment
982 /// Note: IsPow2(alignment) MUST be true
983 //////////////////////////////////////////////////////////////////////////
984 template <typename T1
, typename T2
>
986 static T1
AlignUp(T1 value
, T2 alignment
)
988 return AlignDown(value
+ T1(alignment
- 1), alignment
);
991 //////////////////////////////////////////////////////////////////////////
992 /// Align up to specified alignment
993 /// Note: IsPow2(alignment) MUST be true
994 //////////////////////////////////////////////////////////////////////////
995 template <typename T1
, typename T2
>
997 static T1
* AlignUp(T1
* value
, T2 alignment
)
999 return AlignDown(PtrAdd(value
, alignment
- 1), alignment
);
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
//////////////////////////////////////////////////////////////////////////
/// Packs ArrayLenT elements of BitsPerElementT bits each into an array of
/// size_t words. BitsPerElementT must evenly divide the word size so no
/// element straddles a word boundary (enforced by the static_assert).
/// Storage is zero-initialized by default.
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    // Round up so a partially-used final word is still allocated.
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    // Low-bit mask selecting one element's bits within a word.
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    // Fixed diagnostic grammar ("must an" -> "must be an").
    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Reads the element at elementIndex.
    /// @param elementIndex - index of the packed element (< ArrayLenT).
    /// @return the element's bits, converted to T.
    T operator[] (size_t elementIndex) const
    {
        // Fetch the containing word, shift the element down to bit 0,
        // and mask off neighboring elements.
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
// Ranged integer argument for TemplateArgUnroller
// Carries a runtime integer value; the template bounds declare the closed
// range [TMin, TMax] it may take, letting TemplateArgUnroller peel the
// range one candidate value at a time until the runtime value is matched.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};
1037 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1038 // arguments to static template arguments.
1039 template <typename TermT
, typename
... ArgsB
>
1040 struct TemplateArgUnroller
1042 //-----------------------------------------
1044 //-----------------------------------------
1046 // Last Arg Terminator
1047 static typename
TermT::FuncType
GetFunc(bool bArg
)
1051 return TermT::template GetFunc
<ArgsB
..., std::true_type
>();
1054 return TermT::template GetFunc
<ArgsB
..., std::false_type
>();
1057 // Recursively parse args
1058 template <typename
... TArgsT
>
1059 static typename
TermT::FuncType
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
1063 return TemplateArgUnroller
<TermT
, ArgsB
..., std::true_type
>::GetFunc(remainingArgs
...);
1066 return TemplateArgUnroller
<TermT
, ArgsB
..., std::false_type
>::GetFunc(remainingArgs
...);
1069 //-----------------------------------------
1070 // Integer value (within specified range)
1071 //-----------------------------------------
1073 // Last Arg Terminator
1074 template <uint32_t TMin
, uint32_t TMax
>
1075 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
)
1077 if (iArg
.val
== TMax
)
1079 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TMax
>>();
1083 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
-1>{iArg
.val
});
1085 SWR_ASSUME(false); return nullptr;
1087 template <uint32_t TVal
>
1088 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
)
1090 SWR_ASSERT(iArg
.val
== TVal
);
1091 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TVal
>>();
1094 // Recursively parse args
1095 template <uint32_t TMin
, uint32_t TMax
, typename
... TArgsT
>
1096 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
, TArgsT
... remainingArgs
)
1098 if (iArg
.val
== TMax
)
1100 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TMax
>>::GetFunc(remainingArgs
...);
1104 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
- 1>{iArg
.val
}, remainingArgs
...);
1106 SWR_ASSUME(false); return nullptr;
1108 template <uint32_t TVal
, typename
... TArgsT
>
1109 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
, TArgsT
... remainingArgs
)
1111 SWR_ASSERT(iArg
.val
== TVal
);
1112 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TVal
>>::GetFunc(remainingArgs
...);