1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Utilities used by SWR core.
27 ******************************************************************************/
31 #include <type_traits>
32 #include "common/os.h"
33 #include "common/simdintrin.h"
34 #include "common/swr_assert.h"
#if defined(_WIN64) || defined(__x86_64__)
// 64-bit targets have the native SSE4.1 64-bit lane intrinsics.
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
//////////////////////////////////////////////////////////////////////////
/// @brief Emulation of _mm_extract_epi64 for 32-bit builds, where the
///        native intrinsic is unavailable.
/// @param a   - source vector
/// @param ndx - 64-bit lane to extract (0 or 1)
/// @return the selected 64-bit lane, reassembled from 32-bit halves
INLINE INT64
_MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    // Spill the vector to aligned memory and reassemble the requested lane.
    OSALIGNLINE(uint32_t) elems[4];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        uint64_t foo = elems[0];
        foo |= (uint64_t)elems[1] << 32;
        return foo;
    }
    else
    {
        uint64_t foo = elems[2];
        foo |= (uint64_t)elems[3] << 32;
        return foo;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Emulation of _mm_insert_epi64 for 32-bit builds.
/// @param a   - source vector
/// @param b   - 64-bit value to insert
/// @param ndx - 64-bit lane to replace (0 or 1)
/// @return copy of a with the selected lane replaced by b
INLINE __m128i
_MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
{
    // Spill, patch the requested lane in memory, and reload.
    OSALIGNLINE(int64_t) elems[2];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        elems[0] = b;
    }
    else
    {
        elems[1] = b;
    }
    __m128i out;
    out = _mm_load_si128((const __m128i*)elems);
    return out;
}
#endif
76 OSALIGNLINE(struct) BBOX
84 BBOX(int t
, int b
, int l
, int r
) : top(t
), bottom(b
), left(l
), right(r
) {}
86 bool operator==(const BBOX
& rhs
)
88 return (this->top
== rhs
.top
&&
89 this->bottom
== rhs
.bottom
&&
90 this->left
== rhs
.left
&&
91 this->right
== rhs
.right
);
94 bool operator!=(const BBOX
& rhs
)
96 return !(*this == rhs
);
109 void vTranspose(__m128
&row0
, __m128
&row1
, __m128
&row2
, __m128
&row3
)
111 __m128i row0i
= _mm_castps_si128(row0
);
112 __m128i row1i
= _mm_castps_si128(row1
);
113 __m128i row2i
= _mm_castps_si128(row2
);
114 __m128i row3i
= _mm_castps_si128(row3
);
116 __m128i vTemp
= row2i
;
117 row2i
= _mm_unpacklo_epi32(row2i
, row3i
);
118 vTemp
= _mm_unpackhi_epi32(vTemp
, row3i
);
121 row0i
= _mm_unpacklo_epi32(row0i
, row1i
);
122 row3i
= _mm_unpackhi_epi32(row3i
, row1i
);
125 row0i
= _mm_unpacklo_epi64(row0i
, row2i
);
126 row1i
= _mm_unpackhi_epi64(row1i
, row2i
);
129 row2i
= _mm_unpacklo_epi64(row2i
, vTemp
);
130 row3i
= _mm_unpackhi_epi64(row3i
, vTemp
);
132 row0
= _mm_castsi128_ps(row0i
);
133 row1
= _mm_castsi128_ps(row1i
);
134 row2
= _mm_castsi128_ps(row2i
);
135 row3
= _mm_castsi128_ps(row3i
);
139 void vTranspose(__m128i
&row0
, __m128i
&row1
, __m128i
&row2
, __m128i
&row3
)
141 __m128i vTemp
= row2
;
142 row2
= _mm_unpacklo_epi32(row2
, row3
);
143 vTemp
= _mm_unpackhi_epi32(vTemp
, row3
);
146 row0
= _mm_unpacklo_epi32(row0
, row1
);
147 row3
= _mm_unpackhi_epi32(row3
, row1
);
150 row0
= _mm_unpacklo_epi64(row0
, row2
);
151 row1
= _mm_unpackhi_epi64(row1
, row2
);
154 row2
= _mm_unpacklo_epi64(row2
, vTemp
);
155 row3
= _mm_unpackhi_epi64(row3
, vTemp
);
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// clang and gcc < 4.9 lack the _mm*_undefined_* intrinsics; fall back to
// zero-initialization, which is behaviorally safe (just slightly slower).
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
170 #if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
172 void vTranspose3x8(__m128 (&vDst
)[8], __m256
&vSrc0
, __m256
&vSrc1
, __m256
&vSrc2
)
174 __m256 r0r2
= _mm256_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
175 __m256 r1rx
= _mm256_unpacklo_ps(vSrc1
, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
176 __m256 r02r1xlolo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
177 __m256 r02r1xlohi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
179 r0r2
= _mm256_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
180 r1rx
= _mm256_unpackhi_ps(vSrc1
, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
181 __m256 r02r1xhilo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
182 __m256 r02r1xhihi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
184 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
185 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
186 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
187 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
189 vDst
[4] = _mm256_extractf128_ps(r02r1xlolo
, 1);
190 vDst
[5] = _mm256_extractf128_ps(r02r1xlohi
, 1);
191 vDst
[6] = _mm256_extractf128_ps(r02r1xhilo
, 1);
192 vDst
[7] = _mm256_extractf128_ps(r02r1xhihi
, 1);
196 void vTranspose4x8(__m128 (&vDst
)[8], __m256
&vSrc0
, __m256
&vSrc1
, __m256
&vSrc2
, __m256
&vSrc3
)
198 __m256 r0r2
= _mm256_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
199 __m256 r1rx
= _mm256_unpacklo_ps(vSrc1
, vSrc3
); //y0w0y1w1 y4w4y5w5
200 __m256 r02r1xlolo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
201 __m256 r02r1xlohi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
203 r0r2
= _mm256_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
204 r1rx
= _mm256_unpackhi_ps(vSrc1
, vSrc3
) ; //y2w2y3w3 y6w6yw77
205 __m256 r02r1xhilo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
206 __m256 r02r1xhihi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
208 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
209 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
210 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
211 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
213 vDst
[4] = _mm256_extractf128_ps(r02r1xlolo
, 1);
214 vDst
[5] = _mm256_extractf128_ps(r02r1xlohi
, 1);
215 vDst
[6] = _mm256_extractf128_ps(r02r1xhilo
, 1);
216 vDst
[7] = _mm256_extractf128_ps(r02r1xhihi
, 1);
220 void vTranspose8x8(__m256 (&vDst
)[8], const __m256
&vMask0
, const __m256
&vMask1
, const __m256
&vMask2
, const __m256
&vMask3
, const __m256
&vMask4
, const __m256
&vMask5
, const __m256
&vMask6
, const __m256
&vMask7
)
222 __m256 __t0
= _mm256_unpacklo_ps(vMask0
, vMask1
);
223 __m256 __t1
= _mm256_unpackhi_ps(vMask0
, vMask1
);
224 __m256 __t2
= _mm256_unpacklo_ps(vMask2
, vMask3
);
225 __m256 __t3
= _mm256_unpackhi_ps(vMask2
, vMask3
);
226 __m256 __t4
= _mm256_unpacklo_ps(vMask4
, vMask5
);
227 __m256 __t5
= _mm256_unpackhi_ps(vMask4
, vMask5
);
228 __m256 __t6
= _mm256_unpacklo_ps(vMask6
, vMask7
);
229 __m256 __t7
= _mm256_unpackhi_ps(vMask6
, vMask7
);
230 __m256 __tt0
= _mm256_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(1,0,1,0));
231 __m256 __tt1
= _mm256_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(3,2,3,2));
232 __m256 __tt2
= _mm256_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(1,0,1,0));
233 __m256 __tt3
= _mm256_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(3,2,3,2));
234 __m256 __tt4
= _mm256_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(1,0,1,0));
235 __m256 __tt5
= _mm256_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(3,2,3,2));
236 __m256 __tt6
= _mm256_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(1,0,1,0));
237 __m256 __tt7
= _mm256_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(3,2,3,2));
238 vDst
[0] = _mm256_permute2f128_ps(__tt0
, __tt4
, 0x20);
239 vDst
[1] = _mm256_permute2f128_ps(__tt1
, __tt5
, 0x20);
240 vDst
[2] = _mm256_permute2f128_ps(__tt2
, __tt6
, 0x20);
241 vDst
[3] = _mm256_permute2f128_ps(__tt3
, __tt7
, 0x20);
242 vDst
[4] = _mm256_permute2f128_ps(__tt0
, __tt4
, 0x31);
243 vDst
[5] = _mm256_permute2f128_ps(__tt1
, __tt5
, 0x31);
244 vDst
[6] = _mm256_permute2f128_ps(__tt2
, __tt6
, 0x31);
245 vDst
[7] = _mm256_permute2f128_ps(__tt3
, __tt7
, 0x31);
249 void vTranspose8x8(__m256 (&vDst
)[8], const __m256i
&vMask0
, const __m256i
&vMask1
, const __m256i
&vMask2
, const __m256i
&vMask3
, const __m256i
&vMask4
, const __m256i
&vMask5
, const __m256i
&vMask6
, const __m256i
&vMask7
)
251 vTranspose8x8(vDst
, _mm256_castsi256_ps(vMask0
), _mm256_castsi256_ps(vMask1
), _mm256_castsi256_ps(vMask2
), _mm256_castsi256_ps(vMask3
),
252 _mm256_castsi256_ps(vMask4
), _mm256_castsi256_ps(vMask5
), _mm256_castsi256_ps(vMask6
), _mm256_castsi256_ps(vMask7
));
256 //////////////////////////////////////////////////////////////////////////
257 /// TranposeSingleComponent
258 //////////////////////////////////////////////////////////////////////////
259 template<uint32_t bpp
>
260 struct TransposeSingleComponent
262 //////////////////////////////////////////////////////////////////////////
263 /// @brief Pass-thru for single component.
264 /// @param pSrc - source data in SOA form
265 /// @param pDst - output data in AOS form
266 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
268 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
272 //////////////////////////////////////////////////////////////////////////
274 //////////////////////////////////////////////////////////////////////////
275 struct Transpose8_8_8_8
277 //////////////////////////////////////////////////////////////////////////
278 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
279 /// @param pSrc - source data in SOA form
280 /// @param pDst - output data in AOS form
281 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
283 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
285 #if KNOB_SIMD_WIDTH == 8
286 #if KNOB_ARCH == KNOB_ARCH_AVX
287 __m128i c0c1
= _mm256_castsi256_si128(src
); // rrrrrrrrgggggggg
288 __m128i c2c3
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src
), 1)); // bbbbbbbbaaaaaaaa
289 __m128i c0c2
= _mm_unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
290 __m128i c1c3
= _mm_unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
291 __m128i c01
= _mm_unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
292 __m128i c23
= _mm_unpackhi_epi8(c0c2
, c1c3
); // babababababababa
293 __m128i c0123lo
= _mm_unpacklo_epi16(c01
, c23
); // rgbargbargbargba
294 __m128i c0123hi
= _mm_unpackhi_epi16(c01
, c23
); // rgbargbargbargba
295 _mm_store_si128((__m128i
*)pDst
, c0123lo
);
296 _mm_store_si128((__m128i
*)(pDst
+ 16), c0123hi
);
297 #elif KNOB_ARCH == KNOB_ARCH_AVX2
298 simdscalari dst01
= _mm256_shuffle_epi8(src
,
299 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
300 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
301 dst23
= _mm256_shuffle_epi8(dst23
,
302 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
303 simdscalari dst
= _mm256_or_si256(dst01
, dst23
);
304 _simd_store_si((simdscalari
*)pDst
, dst
);
306 #elif KNOB_SIMD_WIDTH == 16
307 simdscalari mask0
= _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
309 simdscalari dst01
= _simd_shuffle_epi8(src
, mask0
);
311 simdscalari perm1
= _simd_permute_128(src
, src
, 1);
313 simdscalari mask1
= _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
315 simdscalari dst23
= _simd_shuffle_epi8(perm1
, mask1
);
317 simdscalari dst
= _simd_or_si(dst01
, dst23
);
319 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), dst
);
321 #error Unsupported vector width
326 //////////////////////////////////////////////////////////////////////////
328 //////////////////////////////////////////////////////////////////////////
329 struct Transpose8_8_8
331 //////////////////////////////////////////////////////////////////////////
332 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
333 /// @param pSrc - source data in SOA form
334 /// @param pDst - output data in AOS form
335 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
338 //////////////////////////////////////////////////////////////////////////
340 //////////////////////////////////////////////////////////////////////////
343 //////////////////////////////////////////////////////////////////////////
344 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
345 /// @param pSrc - source data in SOA form
346 /// @param pDst - output data in AOS form
347 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
349 #if KNOB_SIMD_WIDTH == 8
350 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
352 __m128i rg
= _mm256_castsi256_si128(src
); // rrrrrrrr gggggggg
353 __m128i g
= _mm_unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
354 rg
= _mm_unpacklo_epi8(rg
, g
);
355 _mm_store_si128((__m128i
*)pDst
, rg
);
356 #elif KNOB_SIMD_WIDTH == 16
357 __m256i src
= _mm256_load_si256(reinterpret_cast<const __m256i
*>(pSrc
)); // rrrrrrrrrrrrrrrrgggggggggggggggg
359 __m256i r
= _mm256_permute4x64_epi64(src
, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
361 __m256i g
= _mm256_permute4x64_epi64(src
, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx
363 __m256i dst
= _mm256_unpacklo_epi8(r
, g
); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
365 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
), dst
);
367 #error Unsupported vector width
372 //////////////////////////////////////////////////////////////////////////
373 /// Transpose32_32_32_32
374 //////////////////////////////////////////////////////////////////////////
375 struct Transpose32_32_32_32
377 //////////////////////////////////////////////////////////////////////////
378 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
379 /// @param pSrc - source data in SOA form
380 /// @param pDst - output data in AOS form
381 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
383 #if KNOB_SIMD_WIDTH == 8
384 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
385 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
386 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
387 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
390 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
391 _mm_store_ps((float*)pDst
, vDst
[0]);
392 _mm_store_ps((float*)pDst
+4, vDst
[1]);
393 _mm_store_ps((float*)pDst
+8, vDst
[2]);
394 _mm_store_ps((float*)pDst
+12, vDst
[3]);
395 _mm_store_ps((float*)pDst
+16, vDst
[4]);
396 _mm_store_ps((float*)pDst
+20, vDst
[5]);
397 _mm_store_ps((float*)pDst
+24, vDst
[6]);
398 _mm_store_ps((float*)pDst
+28, vDst
[7]);
399 #elif KNOB_SIMD_WIDTH == 16
400 #if ENABLE_AVX512_EMULATION
401 simdscalar src0
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
));
402 simdscalar src1
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 16);
403 simdscalar src2
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 32);
404 simdscalar src3
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 48);
408 vTranspose4x8(vDst
, src0
.lo
, src1
.lo
, src2
.lo
, src3
.lo
);
410 _mm_store_ps(reinterpret_cast<float*>(pDst
), vDst
[0]);
411 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 4, vDst
[1]);
412 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 8, vDst
[2]);
413 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 12, vDst
[3]);
414 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 16, vDst
[4]);
415 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 20, vDst
[5]);
416 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 24, vDst
[6]);
417 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 28, vDst
[7]);
419 vTranspose4x8(vDst
, src0
.hi
, src1
.hi
, src2
.hi
, src3
.hi
);
421 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 32, vDst
[0]);
422 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 36, vDst
[1]);
423 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 40, vDst
[2]);
424 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 44, vDst
[3]);
425 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 48, vDst
[4]);
426 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 52, vDst
[5]);
427 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 56, vDst
[6]);
428 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 60, vDst
[7]);
431 #error Unsupported vector width
436 //////////////////////////////////////////////////////////////////////////
437 /// Transpose32_32_32
438 //////////////////////////////////////////////////////////////////////////
439 struct Transpose32_32_32
441 //////////////////////////////////////////////////////////////////////////
442 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
443 /// @param pSrc - source data in SOA form
444 /// @param pDst - output data in AOS form
445 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
447 #if KNOB_SIMD_WIDTH == 8
448 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
449 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
450 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
453 vTranspose3x8(vDst
, src0
, src1
, src2
);
454 _mm_store_ps((float*)pDst
, vDst
[0]);
455 _mm_store_ps((float*)pDst
+ 4, vDst
[1]);
456 _mm_store_ps((float*)pDst
+ 8, vDst
[2]);
457 _mm_store_ps((float*)pDst
+ 12, vDst
[3]);
458 _mm_store_ps((float*)pDst
+ 16, vDst
[4]);
459 _mm_store_ps((float*)pDst
+ 20, vDst
[5]);
460 _mm_store_ps((float*)pDst
+ 24, vDst
[6]);
461 _mm_store_ps((float*)pDst
+ 28, vDst
[7]);
462 #elif KNOB_SIMD_WIDTH == 16
463 #if ENABLE_AVX512_EMULATION
464 simdscalar src0
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
));
465 simdscalar src1
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 16);
466 simdscalar src2
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 32);
470 vTranspose3x8(vDst
, src0
.lo
, src1
.lo
, src2
.lo
);
472 _mm_store_ps(reinterpret_cast<float*>(pDst
), vDst
[0]);
473 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 4, vDst
[1]);
474 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 8, vDst
[2]);
475 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 12, vDst
[3]);
476 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 16, vDst
[4]);
477 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 20, vDst
[5]);
478 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 24, vDst
[6]);
479 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 28, vDst
[7]);
481 vTranspose3x8(vDst
, src0
.hi
, src1
.hi
, src2
.hi
);
483 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 32, vDst
[0]);
484 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 36, vDst
[1]);
485 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 40, vDst
[2]);
486 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 44, vDst
[3]);
487 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 48, vDst
[4]);
488 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 52, vDst
[5]);
489 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 56, vDst
[6]);
490 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 60, vDst
[7]);
493 #error Unsupported vector width
498 //////////////////////////////////////////////////////////////////////////
500 //////////////////////////////////////////////////////////////////////////
501 struct Transpose32_32
503 //////////////////////////////////////////////////////////////////////////
504 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
505 /// @param pSrc - source data in SOA form
506 /// @param pDst - output data in AOS form
507 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
509 #if KNOB_SIMD_WIDTH == 8
510 const float* pfSrc
= (const float*)pSrc
;
511 __m128 src_r0
= _mm_load_ps(pfSrc
+ 0);
512 __m128 src_r1
= _mm_load_ps(pfSrc
+ 4);
513 __m128 src_g0
= _mm_load_ps(pfSrc
+ 8);
514 __m128 src_g1
= _mm_load_ps(pfSrc
+ 12);
516 __m128 dst0
= _mm_unpacklo_ps(src_r0
, src_g0
);
517 __m128 dst1
= _mm_unpackhi_ps(src_r0
, src_g0
);
518 __m128 dst2
= _mm_unpacklo_ps(src_r1
, src_g1
);
519 __m128 dst3
= _mm_unpackhi_ps(src_r1
, src_g1
);
521 float* pfDst
= (float*)pDst
;
522 _mm_store_ps(pfDst
+ 0, dst0
);
523 _mm_store_ps(pfDst
+ 4, dst1
);
524 _mm_store_ps(pfDst
+ 8, dst2
);
525 _mm_store_ps(pfDst
+ 12, dst3
);
526 #elif KNOB_SIMD_WIDTH == 16
527 const float* pfSrc
= (const float*)pSrc
;
528 __m256 src_r0
= _mm256_load_ps(pfSrc
+ 0);
529 __m256 src_r1
= _mm256_load_ps(pfSrc
+ 8);
530 __m256 src_g0
= _mm256_load_ps(pfSrc
+ 16);
531 __m256 src_g1
= _mm256_load_ps(pfSrc
+ 24);
533 __m256 dst0
= _mm256_unpacklo_ps(src_r0
, src_g0
);
534 __m256 dst1
= _mm256_unpackhi_ps(src_r0
, src_g0
);
535 __m256 dst2
= _mm256_unpacklo_ps(src_r1
, src_g1
);
536 __m256 dst3
= _mm256_unpackhi_ps(src_r1
, src_g1
);
538 float* pfDst
= (float*)pDst
;
539 _mm256_store_ps(pfDst
+ 0, dst0
);
540 _mm256_store_ps(pfDst
+ 8, dst1
);
541 _mm256_store_ps(pfDst
+ 16, dst2
);
542 _mm256_store_ps(pfDst
+ 24, dst3
);
544 #error Unsupported vector width
549 //////////////////////////////////////////////////////////////////////////
550 /// Transpose16_16_16_16
551 //////////////////////////////////////////////////////////////////////////
552 struct Transpose16_16_16_16
554 //////////////////////////////////////////////////////////////////////////
555 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
556 /// @param pSrc - source data in SOA form
557 /// @param pDst - output data in AOS form
558 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
560 #if KNOB_SIMD_WIDTH == 8
561 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
562 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
564 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
565 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
566 __m128i src_b
= _mm256_extractf128_si256(src_ba
, 0);
567 __m128i src_a
= _mm256_extractf128_si256(src_ba
, 1);
569 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
570 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
571 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
572 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
574 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
575 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
576 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
577 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
579 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
580 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
581 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
582 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
583 #elif KNOB_SIMD_WIDTH == 16
584 #if ENABLE_AVX512_EMULATION
585 simdscalari src_rg
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
));
586 simdscalari src_ba
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
+ sizeof(simdscalari
)));
588 __m256i src_r
= src_rg
.lo
;
589 __m256i src_g
= src_rg
.hi
;
590 __m256i src_b
= src_ba
.lo
;
591 __m256i src_a
= src_ba
.hi
;
593 __m256i rg0
= _mm256_unpacklo_epi16(src_r
, src_g
);
594 __m256i rg1
= _mm256_unpackhi_epi16(src_r
, src_g
);
595 __m256i ba0
= _mm256_unpacklo_epi16(src_b
, src_a
);
596 __m256i ba1
= _mm256_unpackhi_epi16(src_b
, src_a
);
598 __m256i dst0
= _mm256_unpacklo_epi32(rg0
, ba0
);
599 __m256i dst1
= _mm256_unpackhi_epi32(rg0
, ba0
);
600 __m256i dst2
= _mm256_unpacklo_epi32(rg1
, ba1
);
601 __m256i dst3
= _mm256_unpackhi_epi32(rg1
, ba1
);
603 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 0, dst0
);
604 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 1, dst1
);
605 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 2, dst2
);
606 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 3, dst3
);
609 #error Unsupported vector width
614 //////////////////////////////////////////////////////////////////////////
615 /// Transpose16_16_16
616 //////////////////////////////////////////////////////////////////////////
617 struct Transpose16_16_16
619 //////////////////////////////////////////////////////////////////////////
620 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
621 /// @param pSrc - source data in SOA form
622 /// @param pDst - output data in AOS form
623 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
625 #if KNOB_SIMD_WIDTH == 8
626 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
628 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
629 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
630 __m128i src_b
= _mm_load_si128((const __m128i
*)(pSrc
+ sizeof(simdscalari
)));
631 __m128i src_a
= _mm_undefined_si128();
633 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
634 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
635 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
636 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
638 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
639 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
640 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
641 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
643 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
644 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
645 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
646 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
647 #elif KNOB_SIMD_WIDTH == 16
648 #if ENABLE_AVX512_EMULATION
649 simdscalari src_rg
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
));
651 __m256i src_r
= src_rg
.lo
;
652 __m256i src_g
= src_rg
.hi
;
653 __m256i src_b
= _mm256_load_si256(reinterpret_cast<const __m256i
*>(pSrc
+ sizeof(simdscalari
)));
654 __m256i src_a
= _mm256_undefined_si256();
656 __m256i rg0
= _mm256_unpacklo_epi16(src_r
, src_g
);
657 __m256i rg1
= _mm256_unpackhi_epi16(src_r
, src_g
);
658 __m256i ba0
= _mm256_unpacklo_epi16(src_b
, src_a
);
659 __m256i ba1
= _mm256_unpackhi_epi16(src_b
, src_a
);
661 __m256i dst0
= _mm256_unpacklo_epi32(rg0
, ba0
);
662 __m256i dst1
= _mm256_unpackhi_epi32(rg0
, ba0
);
663 __m256i dst2
= _mm256_unpacklo_epi32(rg1
, ba1
);
664 __m256i dst3
= _mm256_unpackhi_epi32(rg1
, ba1
);
666 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 0, dst0
);
667 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 1, dst1
);
668 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 2, dst2
);
669 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 3, dst3
);
672 #error Unsupported vector width
677 //////////////////////////////////////////////////////////////////////////
679 //////////////////////////////////////////////////////////////////////////
680 struct Transpose16_16
682 //////////////////////////////////////////////////////////////////////////
683 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
684 /// @param pSrc - source data in SOA form
685 /// @param pDst - output data in AOS form
686 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
688 #if KNOB_SIMD_WIDTH == 8
689 simdscalar src
= _simd_load_ps((const float*)pSrc
);
691 __m128 comp0
= _mm256_castps256_ps128(src
);
692 __m128 comp1
= _mm256_extractf128_ps(src
, 1);
694 __m128i comp0i
= _mm_castps_si128(comp0
);
695 __m128i comp1i
= _mm_castps_si128(comp1
);
697 __m128i resLo
= _mm_unpacklo_epi16(comp0i
, comp1i
);
698 __m128i resHi
= _mm_unpackhi_epi16(comp0i
, comp1i
);
700 _mm_store_si128((__m128i
*)pDst
, resLo
);
701 _mm_store_si128((__m128i
*)pDst
+ 1, resHi
);
702 #elif KNOB_SIMD_WIDTH == 16
703 #if ENABLE_AVX512_EMULATION
704 simdscalari src
= _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc
)));
708 result
.lo
= _mm256_unpacklo_epi16(src
.lo
, src
.hi
);
709 result
.hi
= _mm256_unpackhi_epi16(src
.lo
, src
.hi
);
711 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), result
);
714 #error Unsupported vector width
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    ///        Deleted: this format is not supported by the transpose path.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    ///        Deleted: this format is not supported by the transpose path.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Deleted: this format is not supported by the transpose path.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    ///        Deleted: this format is not supported by the transpose path.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Deleted: this format is not supported by the transpose path.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Deleted: this format is not supported by the transpose path.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Deleted: this format is not supported by the transpose path.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    ///        Deleted: this format is not supported by the transpose path.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Compile-time loop unroller: UnrollerL<Begin, End, Step>::step(f)
///        invokes f(i) for i = Begin, Begin+Step, ... while i != End.
///        (In-class definitions are implicitly inline.)
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    static void step(Lambda& func) {
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};

/// Terminating specialization: Begin == End, recursion stops, no call made.
template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
//////////////////////////////////////////////////////////////////////////
/// @brief General CRC computation over an arbitrary buffer using the
///        SSE4.2 hardware CRC32 instructions (_mm_crc32_*).
/// @param crc   - running CRC value to fold the new data into
/// @param pData - pointer to the data to checksum
/// @param size  - number of bytes at pData
/// @return updated CRC value
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    // 64-bit targets: consume the bulk of the buffer 8 bytes at a time.
    uint32_t numQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    uint64_t* pDataWords = (uint64_t*)pData;
    while (numQwords--)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    // 32-bit targets: consume the bulk of the buffer 4 bytes at a time.
    uint32_t numDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    uint32_t* pDataWords = (uint32_t*)pData;
    while (numDwords--)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    // Fold in any trailing bytes that didn't fill a whole word.
    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
    while (sizeRemainderBytes--)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}
//////////////////////////////////////////////////////////////////////////
/// Add a byte offset to a pointer of any type.
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static T* PtrAdd(T* p, intptr_t offset)
{
    // Round-trip through an integer so the offset is applied in bytes,
    // not in units of sizeof(T).
    const intptr_t address = reinterpret_cast<intptr_t>(p);
    return reinterpret_cast<T*>(address + offset);
}
//////////////////////////////////////////////////////////////////////////
/// Test whether a value is a power of two.
/// Note: returns true for 0 as well (0 & -0 == 0 == value).
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static bool IsPow2(T value)
{
    // (value & -value) isolates the lowest set bit; for a power of two
    // that single bit is the whole value.
    return (value & (0 - value)) == value;
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));
    // Clear the low bits covered by the alignment.
    const T1 mask = T1(alignment - 1);
    return value & ~mask;
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    // Bump past the boundary, then round back down.
    const T1 bump = T1(alignment - 1);
    return AlignDownPow2(value + bump, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    // Do the arithmetic on the raw address, then convert back.
    const uintptr_t raw = reinterpret_cast<uintptr_t>(value);
    const uintptr_t bumped = raw + uintptr_t(alignment - 1);
    return reinterpret_cast<T1*>(AlignDownPow2(bumped, alignment));
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment (alignment need not be a power
/// of two).
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDown(T1 value, T2 alignment)
{
    // Fast path for power-of-two alignments.
    if (IsPow2(alignment))
    {
        return AlignDownPow2(value, alignment);
    }
    // General case: drop the remainder.
    return value - T1(value % alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align down pointer to specified alignment (alignment need not be a
/// power of two).
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignDown(T1* value, T2 alignment)
{
    // Delegate to the integer overload on the raw address.
    const uintptr_t aligned = AlignDown(uintptr_t(value), alignment);
    return reinterpret_cast<T1*>(aligned);
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment (alignment need not be a power of
/// two).
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUp(T1 value, T2 alignment)
{
    // Bump past the boundary, then round back down.
    const T1 bump = T1(alignment - 1);
    return AlignDown(value + bump, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up pointer to specified alignment (alignment need not be a
/// power of two).
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUp(T1* value, T2 alignment)
{
    // Advance by alignment-1 bytes, then round the pointer back down.
    T1* bumped = PtrAdd(value, alignment - 1);
    return AlignDown(bumped, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.  ArrayLenT elements of
/// BitsPerElementT bits each are packed into machine words; the
/// lower-indexed element within a word occupies the lower-order bits.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    // Round up so partial trailing words still get storage.
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    // Elements may not straddle word boundaries.
    // (fix: corrected diagnostic grammar "must an" -> "must be an")
    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};  // zero-initialized backing storage

public:

    //////////////////////////////////////////////////////////////////////////
    /// @brief Read the element at elementIndex.
    /// @param elementIndex - index of the packed element (0..ArrayLenT-1)
    /// @return the element value, converted to T
    T operator[] (size_t elementIndex) const
    {
        // Select the containing word, shift the element down to bit 0,
        // and mask off neighboring elements.
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
//////////////////////////////////////////////////////////////////////////
/// Ranged integer argument for TemplateArgUnroller: carries a runtime
/// value expected to lie within [TMin, TMax].
//////////////////////////////////////////////////////////////////////////
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};
999 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1000 // arguments to static template arguments.
1001 template <typename TermT
, typename
... ArgsB
>
1002 struct TemplateArgUnroller
1004 //-----------------------------------------
1006 //-----------------------------------------
1008 // Last Arg Terminator
1009 static typename
TermT::FuncType
GetFunc(bool bArg
)
1013 return TermT::template GetFunc
<ArgsB
..., std::true_type
>();
1016 return TermT::template GetFunc
<ArgsB
..., std::false_type
>();
1019 // Recursively parse args
1020 template <typename
... TArgsT
>
1021 static typename
TermT::FuncType
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
1025 return TemplateArgUnroller
<TermT
, ArgsB
..., std::true_type
>::GetFunc(remainingArgs
...);
1028 return TemplateArgUnroller
<TermT
, ArgsB
..., std::false_type
>::GetFunc(remainingArgs
...);
1031 //-----------------------------------------
1032 // Integer value (within specified range)
1033 //-----------------------------------------
1035 // Last Arg Terminator
1036 template <uint32_t TMin
, uint32_t TMax
>
1037 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
)
1039 if (iArg
.val
== TMax
)
1041 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TMax
>>();
1045 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
-1>{iArg
.val
});
1047 SWR_ASSUME(false); return nullptr;
1049 template <uint32_t TVal
>
1050 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
)
1052 SWR_ASSERT(iArg
.val
== TVal
);
1053 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TVal
>>();
1056 // Recursively parse args
1057 template <uint32_t TMin
, uint32_t TMax
, typename
... TArgsT
>
1058 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
, TArgsT
... remainingArgs
)
1060 if (iArg
.val
== TMax
)
1062 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TMax
>>::GetFunc(remainingArgs
...);
1066 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
- 1>{iArg
.val
}, remainingArgs
...);
1068 SWR_ASSUME(false); return nullptr;
1070 template <uint32_t TVal
, typename
... TArgsT
>
1071 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
, TArgsT
... remainingArgs
)
1073 SWR_ASSERT(iArg
.val
== TVal
);
1074 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TVal
>>::GetFunc(remainingArgs
...);