1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Utilities used by SWR core.
27 ******************************************************************************/
31 #include <type_traits>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
// 32-bit builds don't have the 64-bit element intrinsics available;
// emulate them with an aligned spill to memory.

//////////////////////////////////////////////////////////////////////////
/// @brief Extract 64-bit lane @p ndx (0 or 1) from @p a.
INLINE
int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    OSALIGNLINE(uint32_t) elems[4];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        // reassemble lane 0 from its two 32-bit halves
        uint64_t foo = elems[0];
        foo |= (uint64_t)elems[1] << 32;
        return foo;
    }
    else
    {
        // reassemble lane 1 from its two 32-bit halves
        uint64_t foo = elems[2];
        foo |= (uint64_t)elems[3] << 32;
        return foo;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Return @p a with 64-bit lane @p ndx (0 or 1) replaced by @p b.
INLINE __m128i
_MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    OSALIGNLINE(int64_t) elems[2];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        elems[0] = b;
    }
    else
    {
        elems[1] = b;
    }
    __m128i out;
    out = _mm_load_si128((const __m128i*)elems);
    return out;
}
#endif
86 #if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief In-place transpose of a 4x4 float matrix held in four SSE rows.
/// @param row0..row3 - matrix rows on input, matrix columns on output
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    __m128i row0i = _mm_castps_si128(row0);
    __m128i row1i = _mm_castps_si128(row1);
    __m128i row2i = _mm_castps_si128(row2);
    __m128i row3i = _mm_castps_si128(row3);

    // interleave the 32-bit elements of rows 2/3 (keep a copy of row2 so
    // both halves of the interleave are available)
    __m128i vTemp = row2i;
    row2i = _mm_unpacklo_epi32(row2i, row3i);
    vTemp = _mm_unpackhi_epi32(vTemp, row3i);

    // interleave the 32-bit elements of rows 0/1 (row3i temporarily holds
    // the original row0 so the high half can be formed too)
    row3i = row0i;
    row0i = _mm_unpacklo_epi32(row0i, row1i);
    row3i = _mm_unpackhi_epi32(row3i, row1i);

    // combine 64-bit halves to produce the first two transposed rows
    row1i = row0i;
    row0i = _mm_unpacklo_epi64(row0i, row2i);
    row1i = _mm_unpackhi_epi64(row1i, row2i);

    // combine 64-bit halves to produce the last two transposed rows
    row2i = row3i;
    row2i = _mm_unpacklo_epi64(row2i, vTemp);
    row3i = _mm_unpackhi_epi64(row3i, vTemp);

    row0 = _mm_castsi128_ps(row0i);
    row1 = _mm_castsi128_ps(row1i);
    row2 = _mm_castsi128_ps(row2i);
    row3 = _mm_castsi128_ps(row3i);
}
//////////////////////////////////////////////////////////////////////////
/// @brief In-place transpose of a 4x4 32-bit integer matrix.
/// @param row0..row3 - matrix rows on input, matrix columns on output
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    // interleave rows 2/3 (vTemp preserves row2 for the high half)
    __m128i vTemp = row2;
    row2 = _mm_unpacklo_epi32(row2, row3);
    vTemp = _mm_unpackhi_epi32(vTemp, row3);

    // interleave rows 0/1 (row3 temporarily preserves row0)
    row3 = row0;
    row0 = _mm_unpacklo_epi32(row0, row1);
    row3 = _mm_unpackhi_epi32(row3, row1);

    // combine 64-bit halves: first two transposed rows
    row1 = row0;
    row0 = _mm_unpacklo_epi64(row0, row2);
    row1 = _mm_unpackhi_epi64(row1, row2);

    // combine 64-bit halves: last two transposed rows
    row2 = row3;
    row2 = _mm_unpacklo_epi64(row2, vTemp);
    row3 = _mm_unpackhi_epi64(row3, vTemp);
}
146 #if KNOB_SIMD_WIDTH == 8
148 void vTranspose3x8(__m128 (&vDst
)[8], const simdscalar
&vSrc0
, const simdscalar
&vSrc1
, const simdscalar
&vSrc2
)
150 simdscalar r0r2
= _simd_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
151 simdscalar r1rx
= _simd_unpacklo_ps(vSrc1
, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
152 simdscalar r02r1xlolo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
153 simdscalar r02r1xlohi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
155 r0r2
= _simd_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
156 r1rx
= _simd_unpackhi_ps(vSrc1
, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77
157 simdscalar r02r1xhilo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
158 simdscalar r02r1xhihi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
160 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
161 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
162 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
163 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
165 vDst
[4] = _simd_extractf128_ps(r02r1xlolo
, 1);
166 vDst
[5] = _simd_extractf128_ps(r02r1xlohi
, 1);
167 vDst
[6] = _simd_extractf128_ps(r02r1xhilo
, 1);
168 vDst
[7] = _simd_extractf128_ps(r02r1xhihi
, 1);
172 void vTranspose4x8(__m128 (&vDst
)[8], const simdscalar
&vSrc0
, const simdscalar
&vSrc1
, const simdscalar
&vSrc2
, const simdscalar
&vSrc3
)
174 simdscalar r0r2
= _simd_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
175 simdscalar r1rx
= _simd_unpacklo_ps(vSrc1
, vSrc3
); //y0w0y1w1 y4w4y5w5
176 simdscalar r02r1xlolo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
177 simdscalar r02r1xlohi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
179 r0r2
= _simd_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
180 r1rx
= _simd_unpackhi_ps(vSrc1
, vSrc3
); //y2w2y3w3 y6w6yw77
181 simdscalar r02r1xhilo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
182 simdscalar r02r1xhihi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
184 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
185 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
186 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
187 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
189 vDst
[4] = _simd_extractf128_ps(r02r1xlolo
, 1);
190 vDst
[5] = _simd_extractf128_ps(r02r1xlohi
, 1);
191 vDst
[6] = _simd_extractf128_ps(r02r1xhilo
, 1);
192 vDst
[7] = _simd_extractf128_ps(r02r1xhihi
, 1);
#if ENABLE_AVX512_SIMD16

//////////////////////////////////////////////////////////////////////////
/// @brief 4-component, 16-wide SOA -> AOS transpose.
/// @param dst - four output vectors, each holding four consecutive xyzw tuples
/// @param src0..src3 - 16 r, g, b, a values respectively
void vTranspose4x16(simd16scalar (&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
{
    // pre-permute input to setup the right order after all the unpacking
    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
    simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
    simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a

    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);

    dst[0] = _simd16_unpacklo_ps(rblo, galo);
    dst[1] = _simd16_unpackhi_ps(rblo, galo);
    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
}

#endif // ENABLE_AVX512_SIMD16
219 void vTranspose8x8(simdscalar (&vDst
)[8], const simdscalar
&vMask0
, const simdscalar
&vMask1
, const simdscalar
&vMask2
, const simdscalar
&vMask3
, const simdscalar
&vMask4
, const simdscalar
&vMask5
, const simdscalar
&vMask6
, const simdscalar
&vMask7
)
221 simdscalar __t0
= _simd_unpacklo_ps(vMask0
, vMask1
);
222 simdscalar __t1
= _simd_unpackhi_ps(vMask0
, vMask1
);
223 simdscalar __t2
= _simd_unpacklo_ps(vMask2
, vMask3
);
224 simdscalar __t3
= _simd_unpackhi_ps(vMask2
, vMask3
);
225 simdscalar __t4
= _simd_unpacklo_ps(vMask4
, vMask5
);
226 simdscalar __t5
= _simd_unpackhi_ps(vMask4
, vMask5
);
227 simdscalar __t6
= _simd_unpacklo_ps(vMask6
, vMask7
);
228 simdscalar __t7
= _simd_unpackhi_ps(vMask6
, vMask7
);
229 simdscalar __tt0
= _simd_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(1,0,1,0));
230 simdscalar __tt1
= _simd_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(3,2,3,2));
231 simdscalar __tt2
= _simd_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(1,0,1,0));
232 simdscalar __tt3
= _simd_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(3,2,3,2));
233 simdscalar __tt4
= _simd_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(1,0,1,0));
234 simdscalar __tt5
= _simd_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(3,2,3,2));
235 simdscalar __tt6
= _simd_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(1,0,1,0));
236 simdscalar __tt7
= _simd_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(3,2,3,2));
237 vDst
[0] = _simd_permute2f128_ps(__tt0
, __tt4
, 0x20);
238 vDst
[1] = _simd_permute2f128_ps(__tt1
, __tt5
, 0x20);
239 vDst
[2] = _simd_permute2f128_ps(__tt2
, __tt6
, 0x20);
240 vDst
[3] = _simd_permute2f128_ps(__tt3
, __tt7
, 0x20);
241 vDst
[4] = _simd_permute2f128_ps(__tt0
, __tt4
, 0x31);
242 vDst
[5] = _simd_permute2f128_ps(__tt1
, __tt5
, 0x31);
243 vDst
[6] = _simd_permute2f128_ps(__tt2
, __tt6
, 0x31);
244 vDst
[7] = _simd_permute2f128_ps(__tt3
, __tt7
, 0x31);
248 void vTranspose8x8(simdscalar (&vDst
)[8], const simdscalari
&vMask0
, const simdscalari
&vMask1
, const simdscalari
&vMask2
, const simdscalari
&vMask3
, const simdscalari
&vMask4
, const simdscalari
&vMask5
, const simdscalari
&vMask6
, const simdscalari
&vMask7
)
250 vTranspose8x8(vDst
, _simd_castsi_ps(vMask0
), _simd_castsi_ps(vMask1
), _simd_castsi_ps(vMask2
), _simd_castsi_ps(vMask3
),
251 _simd_castsi_ps(vMask4
), _simd_castsi_ps(vMask5
), _simd_castsi_ps(vMask6
), _simd_castsi_ps(vMask7
));
255 //////////////////////////////////////////////////////////////////////////
256 /// TranposeSingleComponent
257 //////////////////////////////////////////////////////////////////////////
258 template<uint32_t bpp
>
259 struct TransposeSingleComponent
261 //////////////////////////////////////////////////////////////////////////
262 /// @brief Pass-thru for single component.
263 /// @param pSrc - source data in SOA form
264 /// @param pDst - output data in AOS form
265 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
267 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
269 #if ENABLE_AVX512_SIMD16
271 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
273 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD16_WIDTH
) / 8);
278 //////////////////////////////////////////////////////////////////////////
280 //////////////////////////////////////////////////////////////////////////
281 struct Transpose8_8_8_8
283 //////////////////////////////////////////////////////////////////////////
284 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
285 /// @param pSrc - source data in SOA form
286 /// @param pDst - output data in AOS form
287 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
289 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
291 #if KNOB_SIMD_WIDTH == 8
292 #if KNOB_ARCH <= KNOB_ARCH_AVX
293 __m128i c0c1
= _mm256_castsi256_si128(src
); // rrrrrrrrgggggggg
294 __m128i c2c3
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src
), 1)); // bbbbbbbbaaaaaaaa
295 __m128i c0c2
= _mm_unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
296 __m128i c1c3
= _mm_unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
297 __m128i c01
= _mm_unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
298 __m128i c23
= _mm_unpackhi_epi8(c0c2
, c1c3
); // babababababababa
299 __m128i c0123lo
= _mm_unpacklo_epi16(c01
, c23
); // rgbargbargbargba
300 __m128i c0123hi
= _mm_unpackhi_epi16(c01
, c23
); // rgbargbargbargba
301 _mm_store_si128((__m128i
*)pDst
, c0123lo
);
302 _mm_store_si128((__m128i
*)(pDst
+ 16), c0123hi
);
304 simdscalari dst01
= _simd_shuffle_epi8(src
,
305 _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
306 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
307 dst23
= _simd_shuffle_epi8(dst23
,
308 _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
309 simdscalari dst
= _simd_or_si(dst01
, dst23
);
310 _simd_store_si((simdscalari
*)pDst
, dst
);
313 #error Unsupported vector width
316 #if ENABLE_AVX512_SIMD16
318 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
320 __m128i src0
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
)); // rrrrrrrrrrrrrrrr
321 __m128i src1
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
) + 1); // gggggggggggggggg
322 __m128i src2
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
323 __m128i src3
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
) + 3); // aaaaaaaaaaaaaaaa
325 simd16scalari cvt0
= _simd16_cvtepu8_epi32(src0
);
326 simd16scalari cvt1
= _simd16_cvtepu8_epi32(src1
);
327 simd16scalari cvt2
= _simd16_cvtepu8_epi32(src2
);
328 simd16scalari cvt3
= _simd16_cvtepu8_epi32(src3
);
330 simd16scalari shl1
= _simd16_slli_epi32(cvt1
, 8);
331 simd16scalari shl2
= _simd16_slli_epi32(cvt2
, 16);
332 simd16scalari shl3
= _simd16_slli_epi32(cvt3
, 24);
334 simd16scalari dst
= _simd16_or_si(_simd16_or_si(cvt0
, shl1
), _simd16_or_si(shl2
, shl3
));
336 _simd16_store_si(reinterpret_cast<simd16scalari
*>(pDst
), dst
); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
341 //////////////////////////////////////////////////////////////////////////
343 //////////////////////////////////////////////////////////////////////////
344 struct Transpose8_8_8
346 //////////////////////////////////////////////////////////////////////////
347 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
348 /// @param pSrc - source data in SOA form
349 /// @param pDst - output data in AOS form
350 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
351 #if ENABLE_AVX512_SIMD16
353 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
357 //////////////////////////////////////////////////////////////////////////
359 //////////////////////////////////////////////////////////////////////////
362 //////////////////////////////////////////////////////////////////////////
363 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
364 /// @param pSrc - source data in SOA form
365 /// @param pDst - output data in AOS form
366 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
368 #if KNOB_SIMD_WIDTH == 8
369 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
371 __m128i rg
= _mm256_castsi256_si128(src
); // rrrrrrrr gggggggg
372 __m128i g
= _mm_unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
373 rg
= _mm_unpacklo_epi8(rg
, g
);
374 _mm_store_si128((__m128i
*)pDst
, rg
);
376 #error Unsupported vector width
379 #if ENABLE_AVX512_SIMD16
381 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
383 __m128i src0
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
)); // rrrrrrrrrrrrrrrr
384 __m128i src1
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
) + 1); // gggggggggggggggg
386 simdscalari cvt0
= _simd_cvtepu8_epi16(src0
);
387 simdscalari cvt1
= _simd_cvtepu8_epi16(src1
);
389 simdscalari shl1
= _simd_slli_epi32(cvt1
, 8);
391 simdscalari dst
= _simd_or_si(cvt0
, shl1
);
393 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), dst
); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
398 //////////////////////////////////////////////////////////////////////////
399 /// Transpose32_32_32_32
400 //////////////////////////////////////////////////////////////////////////
401 struct Transpose32_32_32_32
403 //////////////////////////////////////////////////////////////////////////
404 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
405 /// @param pSrc - source data in SOA form
406 /// @param pDst - output data in AOS form
407 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
409 #if KNOB_SIMD_WIDTH == 8
410 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
411 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
412 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
413 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
416 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
417 _mm_store_ps((float*)pDst
, vDst
[0]);
418 _mm_store_ps((float*)pDst
+4, vDst
[1]);
419 _mm_store_ps((float*)pDst
+8, vDst
[2]);
420 _mm_store_ps((float*)pDst
+12, vDst
[3]);
421 _mm_store_ps((float*)pDst
+16, vDst
[4]);
422 _mm_store_ps((float*)pDst
+20, vDst
[5]);
423 _mm_store_ps((float*)pDst
+24, vDst
[6]);
424 _mm_store_ps((float*)pDst
+28, vDst
[7]);
426 #error Unsupported vector width
429 #if ENABLE_AVX512_SIMD16
431 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
433 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
434 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16);
435 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 32);
436 simd16scalar src3
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 48);
440 vTranspose4x16(dst
, src0
, src1
, src2
, src3
);
442 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst
[0]);
443 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst
[1]);
444 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 32, dst
[2]);
445 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 48, dst
[3]);
450 //////////////////////////////////////////////////////////////////////////
451 /// Transpose32_32_32
452 //////////////////////////////////////////////////////////////////////////
453 struct Transpose32_32_32
455 //////////////////////////////////////////////////////////////////////////
456 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
457 /// @param pSrc - source data in SOA form
458 /// @param pDst - output data in AOS form
459 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
461 #if KNOB_SIMD_WIDTH == 8
462 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
463 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
464 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
467 vTranspose3x8(vDst
, src0
, src1
, src2
);
468 _mm_store_ps((float*)pDst
, vDst
[0]);
469 _mm_store_ps((float*)pDst
+ 4, vDst
[1]);
470 _mm_store_ps((float*)pDst
+ 8, vDst
[2]);
471 _mm_store_ps((float*)pDst
+ 12, vDst
[3]);
472 _mm_store_ps((float*)pDst
+ 16, vDst
[4]);
473 _mm_store_ps((float*)pDst
+ 20, vDst
[5]);
474 _mm_store_ps((float*)pDst
+ 24, vDst
[6]);
475 _mm_store_ps((float*)pDst
+ 28, vDst
[7]);
477 #error Unsupported vector width
480 #if ENABLE_AVX512_SIMD16
482 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
484 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
485 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16);
486 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 32);
487 simd16scalar src3
= _simd16_setzero_ps();
491 vTranspose4x16(dst
, src0
, src1
, src2
, src3
);
493 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst
[0]);
494 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst
[1]);
495 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 32, dst
[2]);
496 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 48, dst
[3]);
501 //////////////////////////////////////////////////////////////////////////
503 //////////////////////////////////////////////////////////////////////////
504 struct Transpose32_32
506 //////////////////////////////////////////////////////////////////////////
507 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
508 /// @param pSrc - source data in SOA form
509 /// @param pDst - output data in AOS form
510 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
512 #if KNOB_SIMD_WIDTH == 8
513 const float* pfSrc
= (const float*)pSrc
;
514 __m128 src_r0
= _mm_load_ps(pfSrc
+ 0);
515 __m128 src_r1
= _mm_load_ps(pfSrc
+ 4);
516 __m128 src_g0
= _mm_load_ps(pfSrc
+ 8);
517 __m128 src_g1
= _mm_load_ps(pfSrc
+ 12);
519 __m128 dst0
= _mm_unpacklo_ps(src_r0
, src_g0
);
520 __m128 dst1
= _mm_unpackhi_ps(src_r0
, src_g0
);
521 __m128 dst2
= _mm_unpacklo_ps(src_r1
, src_g1
);
522 __m128 dst3
= _mm_unpackhi_ps(src_r1
, src_g1
);
524 float* pfDst
= (float*)pDst
;
525 _mm_store_ps(pfDst
+ 0, dst0
);
526 _mm_store_ps(pfDst
+ 4, dst1
);
527 _mm_store_ps(pfDst
+ 8, dst2
);
528 _mm_store_ps(pfDst
+ 12, dst3
);
530 #error Unsupported vector width
533 #if ENABLE_AVX512_SIMD16
535 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
537 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
)); // rrrrrrrrrrrrrrrr
538 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16); // gggggggggggggggg
540 simd16scalar tmp0
= _simd16_unpacklo_ps(src0
, src1
); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
541 simd16scalar tmp1
= _simd16_unpackhi_ps(src0
, src1
); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
543 simd16scalar per0
= _simd16_permute2f128_ps(tmp0
, tmp1
, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
544 simd16scalar per1
= _simd16_permute2f128_ps(tmp0
, tmp1
, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
546 simd16scalar dst0
= _simd16_permute2f128_ps(per0
, per0
, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
547 simd16scalar dst1
= _simd16_permute2f128_ps(per1
, per1
, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
549 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst0
); // rgrgrgrgrgrgrgrg
550 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst1
); // rgrgrgrgrgrgrgrg
555 //////////////////////////////////////////////////////////////////////////
556 /// Transpose16_16_16_16
557 //////////////////////////////////////////////////////////////////////////
558 struct Transpose16_16_16_16
560 //////////////////////////////////////////////////////////////////////////
561 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
562 /// @param pSrc - source data in SOA form
563 /// @param pDst - output data in AOS form
564 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
566 #if KNOB_SIMD_WIDTH == 8
567 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
568 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
570 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
571 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
572 __m128i src_b
= _mm256_extractf128_si256(src_ba
, 0);
573 __m128i src_a
= _mm256_extractf128_si256(src_ba
, 1);
575 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
576 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
577 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
578 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
580 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
581 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
582 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
583 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
585 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
586 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
587 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
588 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
590 #error Unsupported vector width
593 #if ENABLE_AVX512_SIMD16
595 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
597 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
598 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
599 simdscalari src2
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
600 simdscalari src3
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 3); // aaaaaaaaaaaaaaaa
602 simdscalari pre0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
603 simdscalari pre1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
604 simdscalari pre2
= _simd_unpacklo_epi16(src2
, src3
); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
605 simdscalari pre3
= _simd_unpackhi_epi16(src2
, src3
); // ba4 ba5 ba6 ba7 baC baD baE baF
607 simdscalari tmp0
= _simd_unpacklo_epi32(pre0
, pre2
); // rbga0 rbga1 rbga8 rbga9
608 simdscalari tmp1
= _simd_unpackhi_epi32(pre0
, pre2
); // rbga2 rbga3 rbgaA rbgaB
609 simdscalari tmp2
= _simd_unpacklo_epi32(pre1
, pre3
); // rbga4 rbga5 rgbaC rbgaD
610 simdscalari tmp3
= _simd_unpackhi_epi32(pre1
, pre3
); // rbga6 rbga7 rbgaE rbgaF
612 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
613 simdscalari dst1
= _simd_permute2f128_si(tmp2
, tmp3
, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
614 simdscalari dst2
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
615 simdscalari dst3
= _simd_permute2f128_si(tmp2
, tmp3
, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
617 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgbargbargbargba
618 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgbargbargbargba
619 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 2, dst2
); // rgbargbargbargba
620 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 3, dst3
); // rgbargbargbargba
625 //////////////////////////////////////////////////////////////////////////
626 /// Transpose16_16_16
627 //////////////////////////////////////////////////////////////////////////
628 struct Transpose16_16_16
630 //////////////////////////////////////////////////////////////////////////
631 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
632 /// @param pSrc - source data in SOA form
633 /// @param pDst - output data in AOS form
634 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
636 #if KNOB_SIMD_WIDTH == 8
637 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
639 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
640 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
641 __m128i src_b
= _mm_load_si128((const __m128i
*)(pSrc
+ sizeof(simdscalari
)));
642 __m128i src_a
= _mm_undefined_si128();
644 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
645 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
646 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
647 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
649 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
650 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
651 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
652 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
654 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
655 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
656 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
657 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
659 #error Unsupported vector width
662 #if ENABLE_AVX512_SIMD16
664 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
666 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
667 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
668 simdscalari src2
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
669 simdscalari src3
= _simd_setzero_si(); // aaaaaaaaaaaaaaaa
671 simdscalari pre0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
672 simdscalari pre1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
673 simdscalari pre2
= _simd_unpacklo_epi16(src2
, src3
); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
674 simdscalari pre3
= _simd_unpackhi_epi16(src2
, src3
); // ba4 ba5 ba6 ba7 baC baD baE baF
676 simdscalari tmp0
= _simd_unpacklo_epi32(pre0
, pre2
); // rbga0 rbga1 rbga8 rbga9
677 simdscalari tmp1
= _simd_unpackhi_epi32(pre0
, pre2
); // rbga2 rbga3 rbgaA rbgaB
678 simdscalari tmp2
= _simd_unpacklo_epi32(pre1
, pre3
); // rbga4 rbga5 rgbaC rbgaD
679 simdscalari tmp3
= _simd_unpackhi_epi32(pre1
, pre3
); // rbga6 rbga7 rbgaE rbgaF
681 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
682 simdscalari dst1
= _simd_permute2f128_si(tmp2
, tmp3
, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
683 simdscalari dst2
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
684 simdscalari dst3
= _simd_permute2f128_si(tmp2
, tmp3
, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
686 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgbargbargbargba
687 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgbargbargbargba
688 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 2, dst2
); // rgbargbargbargba
689 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 3, dst3
); // rgbargbargbargba
694 //////////////////////////////////////////////////////////////////////////
696 //////////////////////////////////////////////////////////////////////////
697 struct Transpose16_16
699 //////////////////////////////////////////////////////////////////////////
700 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
701 /// @param pSrc - source data in SOA form
702 /// @param pDst - output data in AOS form
703 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
705 #if KNOB_SIMD_WIDTH == 8
706 simdscalar src
= _simd_load_ps((const float*)pSrc
);
708 __m128 comp0
= _mm256_castps256_ps128(src
);
709 __m128 comp1
= _mm256_extractf128_ps(src
, 1);
711 __m128i comp0i
= _mm_castps_si128(comp0
);
712 __m128i comp1i
= _mm_castps_si128(comp1
);
714 __m128i resLo
= _mm_unpacklo_epi16(comp0i
, comp1i
);
715 __m128i resHi
= _mm_unpackhi_epi16(comp0i
, comp1i
);
717 _mm_store_si128((__m128i
*)pDst
, resLo
);
718 _mm_store_si128((__m128i
*)pDst
+ 1, resHi
);
720 #error Unsupported vector width
723 #if ENABLE_AVX512_SIMD16
725 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
727 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
728 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
730 simdscalari tmp0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
731 simdscalari tmp1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
733 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
734 simdscalari dst1
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
736 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgrgrgrgrgrgrgrg
737 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgrgrgrgrgrgrgrg
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64
//////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion.
    ///        Deleted: no transpose is implemented for this format.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // 16-wide variant — also unimplemented for this format.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
946 // helper function to unroll loops
947 template<int Begin
, int End
, int Step
= 1>
949 template<typename Lambda
>
950 INLINE
static void step(Lambda
& func
) {
952 UnrollerL
<Begin
+ Step
, End
, Step
>::step(func
);
956 template<int End
, int Step
>
957 struct UnrollerL
<End
, End
, Step
> {
958 template<typename Lambda
>
959 static void step(Lambda
& func
) {
963 // helper function to unroll loops, with mask to skip specific iterations
964 template<int Begin
, int End
, int Step
= 1, int Mask
= 0x7f>
965 struct UnrollerLMask
{
966 template<typename Lambda
>
967 INLINE
static void step(Lambda
& func
) {
968 if(Mask
& (1 << Begin
))
972 UnrollerL
<Begin
+ Step
, End
, Step
>::step(func
);
976 template<int End
, int Step
, int Mask
>
977 struct UnrollerLMask
<End
, End
, Step
, Mask
> {
978 template<typename Lambda
>
979 static void step(Lambda
& func
) {
//////////////////////////////////////////////////////////////////////////
/// @brief General CRC computation over an arbitrary buffer using the
///        SSE4.2 crc32 instructions, consuming the widest word size the
///        target supports first, then the remaining bytes.
/// @param crc   - running / seed CRC value
/// @param pData - buffer to accumulate into the CRC
/// @param size  - buffer size in bytes
/// @return updated CRC value
uint32_t ComputeCRC(uint32_t crc, const void* pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    uint32_t sizeInQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    // Keep const-qualification of the caller's buffer instead of casting it away.
    const uint64_t* pDataWords = (const uint64_t*)pData;
    for (uint32_t i = 0; i < sizeInQwords; ++i)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    uint32_t sizeInDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    const uint32_t* pDataWords = (const uint32_t*)pData;
    for (uint32_t i = 0; i < sizeInDwords; ++i)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    // Fold in any trailing bytes that did not fill a whole word.
    const uint8_t* pRemainderBytes = (const uint8_t*)pDataWords;
    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}
//////////////////////////////////////////////////////////////////////////
/// Add byte offset to any-type pointer
//////////////////////////////////////////////////////////////////////////
template <typename T>
static T* PtrAdd(T* p, intptr_t offset)
{
    // Round-trip through an integer so the offset is applied in bytes,
    // regardless of sizeof(T).
    const intptr_t base = reinterpret_cast<intptr_t>(p);
    return reinterpret_cast<T*>(base + offset);
}
//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
/// Note: reports true for 0 as well (no bits set).
//////////////////////////////////////////////////////////////////////////
template <typename T>
static bool IsPow2(T value)
{
    // A value with at most one bit set equals its own lowest set bit.
    return value == (value & (0 - value));
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));
    // Clear the low bits with a mask derived from the power-of-two alignment.
    const T1 mask = T1(alignment - 1);
    return value & ~mask;
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    // Bump past the boundary, then round back down.
    const T1 bump = T1(alignment - 1);
    return AlignDownPow2(value + bump, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    // Operate on the address as an integer, then cast back.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(value);
    return reinterpret_cast<T1*>(AlignDownPow2(addr + uintptr_t(alignment - 1), alignment));
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment (any alignment, pow2 or not)
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDown(T1 value, T2 alignment)
{
    // Fast path: power-of-two alignments use a simple mask.
    if (IsPow2(alignment))
    {
        return AlignDownPow2(value, alignment);
    }
    // General path: strip the remainder.
    return value - T1(value % alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align down pointer to specified alignment (any alignment, pow2 or not)
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignDown(T1* value, T2 alignment)
{
    // Reuse the integer overload on the pointer's address.
    return (T1*)AlignDown(uintptr_t(value), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment (any alignment; non-pow2 handled via
/// the general AlignDown path)
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUp(T1 value, T2 alignment)
{
    // Bump past the boundary, then round back down.
    const T1 bump = T1(alignment - 1);
    return AlignDown(value + bump, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up pointer to specified alignment (any alignment; non-pow2
/// handled via the general AlignDown path)
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUp(T1* value, T2 alignment)
{
    // Advance the pointer by (alignment - 1) bytes, then round back down.
    return AlignDown(PtrAdd(value, alignment - 1), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size. Elements are packed
/// BitsPerElementT bits each into size_t words; reads are zero-based
/// and zero-initialized by default.
//////////////////////////////////////////////////////////////////////////
template <typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
                  "Element size must an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};

public:
    /// @brief Extract element at elementIndex (read-only access).
    T operator[] (size_t elementIndex) const
    {
        // Select the word holding the element, shift it into the low bits,
        // and mask off neighboring elements.
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
// Ranged integer argument for TemplateArgUnroller: carries a runtime value
// known to lie in [TMin, TMax].
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};
1148 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1149 // arguments to static template arguments.
1150 template <typename TermT
, typename
... ArgsB
>
1151 struct TemplateArgUnroller
1153 //-----------------------------------------
1155 //-----------------------------------------
1157 // Last Arg Terminator
1158 static typename
TermT::FuncType
GetFunc(bool bArg
)
1162 return TermT::template GetFunc
<ArgsB
..., std::true_type
>();
1165 return TermT::template GetFunc
<ArgsB
..., std::false_type
>();
1168 // Recursively parse args
1169 template <typename
... TArgsT
>
1170 static typename
TermT::FuncType
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
1174 return TemplateArgUnroller
<TermT
, ArgsB
..., std::true_type
>::GetFunc(remainingArgs
...);
1177 return TemplateArgUnroller
<TermT
, ArgsB
..., std::false_type
>::GetFunc(remainingArgs
...);
1180 //-----------------------------------------
1181 // Integer value (within specified range)
1182 //-----------------------------------------
1184 // Last Arg Terminator
1185 template <uint32_t TMin
, uint32_t TMax
>
1186 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
)
1188 if (iArg
.val
== TMax
)
1190 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TMax
>>();
1194 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
-1>{iArg
.val
});
1196 SWR_ASSUME(false); return nullptr;
1198 template <uint32_t TVal
>
1199 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
)
1201 SWR_ASSERT(iArg
.val
== TVal
);
1202 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TVal
>>();
1205 // Recursively parse args
1206 template <uint32_t TMin
, uint32_t TMax
, typename
... TArgsT
>
1207 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
, TArgsT
... remainingArgs
)
1209 if (iArg
.val
== TMax
)
1211 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TMax
>>::GetFunc(remainingArgs
...);
1215 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
- 1>{iArg
.val
}, remainingArgs
...);
1217 SWR_ASSUME(false); return nullptr;
1219 template <uint32_t TVal
, typename
... TArgsT
>
1220 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
, TArgsT
... remainingArgs
)
1222 SWR_ASSERT(iArg
.val
== TVal
);
1223 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TVal
>>::GetFunc(remainingArgs
...);
1227 //////////////////////////////////////////////////////////////////////////
1228 /// Helpers used to get / set environment variable
1229 //////////////////////////////////////////////////////////////////////////
1230 static INLINE
std::string
GetEnv(const std::string
& variableName
)
1234 DWORD valueSize
= GetEnvironmentVariableA(variableName
.c_str(), nullptr, 0);
1235 if (!valueSize
) return output
;
1236 output
.resize(valueSize
- 1); // valueSize includes null, output.resize() does not
1237 GetEnvironmentVariableA(variableName
.c_str(), &output
[0], valueSize
);
1239 output
= getenv(variableName
.c_str());
1245 static INLINE
void SetEnv(const std::string
& variableName
, const std::string
& value
)
1248 SetEnvironmentVariableA(variableName
.c_str(), value
.c_str());
1250 setenv(variableName
.c_str(), value
.c_str(), true);