1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
31 #include <type_traits>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
// 32-bit builds lack the native 64-bit element intrinsics; emulate them by
// spilling the vector to aligned scratch memory.

/// @brief Extract 64-bit element ndx (0 or 1) from a 128-bit vector.
/// @param a   - source vector
/// @param ndx - element index; 0 selects the low qword, nonzero the high qword
/// @return the selected 64-bit element
INLINE
int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    OSALIGNLINE(uint32_t) elems[4];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        // Reassemble the low qword from its two 32-bit halves.
        uint64_t foo = elems[0];
        foo |= (uint64_t)elems[1] << 32;
        return foo;
    }
    else
    {
        // Reassemble the high qword from its two 32-bit halves.
        uint64_t foo = elems[2];
        foo |= (uint64_t)elems[3] << 32;
        return foo;
    }
}

/// @brief Insert 64-bit value b at element ndx (0 or 1) of a 128-bit vector.
/// @param a   - source vector
/// @param b   - value to insert
/// @param ndx - element index; 0 replaces the low qword, nonzero the high qword
/// @return the updated vector
INLINE __m128i
_MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    OSALIGNLINE(int64_t) elems[2];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        elems[0] = b;
    }
    else
    {
        elems[1] = b;
    }
    __m128i out;
    out = _mm_load_si128((const __m128i*)elems);
    return out;
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief In-place transpose of a 4x4 float matrix held in four SSE
///        registers (row i, element j becomes row j, element i).
/// @param row0 - first matrix row, replaced by the first column
/// @param row1 - second matrix row, replaced by the second column
/// @param row2 - third matrix row, replaced by the third column
/// @param row3 - fourth matrix row, replaced by the fourth column
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    // Operate on integer views; the epi32/epi64 unpacks are bit-exact on
    // float data and map to the same shuffle hardware.
    __m128i row0i = _mm_castps_si128(row0);
    __m128i row1i = _mm_castps_si128(row1);
    __m128i row2i = _mm_castps_si128(row2);
    __m128i row3i = _mm_castps_si128(row3);

    // Interleave rows 2/3 at 32-bit granularity.
    __m128i vTemp = row2i;
    row2i = _mm_unpacklo_epi32(row2i, row3i); // c0 d0 c1 d1
    vTemp = _mm_unpackhi_epi32(vTemp, row3i); // c2 d2 c3 d3

    // Interleave rows 0/1, keeping a copy of row0 so the high half survives.
    row3i = row0i;
    row0i = _mm_unpacklo_epi32(row0i, row1i); // a0 b0 a1 b1
    row3i = _mm_unpackhi_epi32(row3i, row1i); // a2 b2 a3 b3

    // Combine 64-bit halves to produce the first two output columns.
    row1i = row0i;
    row0i = _mm_unpacklo_epi64(row0i, row2i); // a0 b0 c0 d0
    row1i = _mm_unpackhi_epi64(row1i, row2i); // a1 b1 c1 d1

    // ... and the last two output columns.
    row2i = row3i;
    row2i = _mm_unpacklo_epi64(row2i, vTemp); // a2 b2 c2 d2
    row3i = _mm_unpackhi_epi64(row3i, vTemp); // a3 b3 c3 d3

    row0 = _mm_castsi128_ps(row0i);
    row1 = _mm_castsi128_ps(row1i);
    row2 = _mm_castsi128_ps(row2i);
    row3 = _mm_castsi128_ps(row3i);
}
//////////////////////////////////////////////////////////////////////////
/// @brief In-place transpose of a 4x4 32-bit integer matrix held in four
///        SSE registers (row i, element j becomes row j, element i).
/// @param row0 - first matrix row, replaced by the first column
/// @param row1 - second matrix row, replaced by the second column
/// @param row2 - third matrix row, replaced by the third column
/// @param row3 - fourth matrix row, replaced by the fourth column
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    // Interleave rows 2/3 at 32-bit granularity.
    __m128i vTemp = row2;
    row2 = _mm_unpacklo_epi32(row2, row3); // c0 d0 c1 d1
    vTemp = _mm_unpackhi_epi32(vTemp, row3); // c2 d2 c3 d3

    // Interleave rows 0/1, keeping a copy of row0 so the high half survives.
    row3 = row0;
    row0 = _mm_unpacklo_epi32(row0, row1); // a0 b0 a1 b1
    row3 = _mm_unpackhi_epi32(row3, row1); // a2 b2 a3 b3

    // Combine 64-bit halves to produce the first two output columns.
    row1 = row0;
    row0 = _mm_unpacklo_epi64(row0, row2); // a0 b0 c0 d0
    row1 = _mm_unpackhi_epi64(row1, row2); // a1 b1 c1 d1

    // ... and the last two output columns.
    row2 = row3;
    row2 = _mm_unpacklo_epi64(row2, vTemp); // a2 b2 c2 d2
    row3 = _mm_unpackhi_epi64(row3, vTemp); // a3 b3 c3 d3
}
136 #define GCC_VERSION (__GNUC__ * 10000 \
137 + __GNUC_MINOR__ * 100 \
138 + __GNUC_PATCHLEVEL__)
140 #if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
141 #define _mm_undefined_ps _mm_setzero_ps
142 #define _mm_undefined_si128 _mm_setzero_si128
143 #if KNOB_SIMD_WIDTH == 8
144 #define _mm256_undefined_ps _mm256_setzero_ps
148 #if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes three SOA lanes (x/y/z of 8 vertices) into eight AOS
///        __m128 outputs; the w component of each output is undefined.
/// @param vDst  - receives one xyzw vector per vertex
/// @param vSrc0 - x components of vertices 0-7
/// @param vSrc1 - y components of vertices 0-7
/// @param vSrc2 - z components of vertices 0-7
void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
{
    // Pair x with z and y with (undefined) w, then interleave the pairs.
    __m256 xz = _mm256_unpacklo_ps(vSrc0, vSrc2);                 // x0z0x1z1 x4z4x5z5
    __m256 yw = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); // y0w0y1w1 y4w4y5w5
    __m256 xyzw04 = _mm256_unpacklo_ps(xz, yw);                   // x0y0z0w0 x4y4z4w4
    __m256 xyzw15 = _mm256_unpackhi_ps(xz, yw);                   // x1y1z1w1 x5y5z5w5

    xz = _mm256_unpackhi_ps(vSrc0, vSrc2);                        // x2z2x3z3 x6z6x7z7
    yw = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        // y2w2y3w3 y6w6y7w7
    __m256 xyzw26 = _mm256_unpacklo_ps(xz, yw);                   // x2y2z2w2 x6y6z6w6
    __m256 xyzw37 = _mm256_unpackhi_ps(xz, yw);                   // x3y3z3w3 x7y7z7w7

    // The low 128-bit lanes hold vertices 0-3, the high lanes vertices 4-7.
    vDst[0] = _mm256_castps256_ps128(xyzw04);
    vDst[1] = _mm256_castps256_ps128(xyzw15);
    vDst[2] = _mm256_castps256_ps128(xyzw26);
    vDst[3] = _mm256_castps256_ps128(xyzw37);

    vDst[4] = _mm256_extractf128_ps(xyzw04, 1);
    vDst[5] = _mm256_extractf128_ps(xyzw15, 1);
    vDst[6] = _mm256_extractf128_ps(xyzw26, 1);
    vDst[7] = _mm256_extractf128_ps(xyzw37, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes four SOA lanes (x/y/z/w of 8 vertices) into eight AOS
///        __m128 outputs.
/// @param vDst  - receives one xyzw vector per vertex
/// @param vSrc0 - x components of vertices 0-7
/// @param vSrc1 - y components of vertices 0-7
/// @param vSrc2 - z components of vertices 0-7
/// @param vSrc3 - w components of vertices 0-7
void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
{
    // Pair x with z and y with w, then interleave the pairs.
    __m256 xz = _mm256_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
    __m256 yw = _mm256_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
    __m256 xyzw04 = _mm256_unpacklo_ps(xz, yw);   // x0y0z0w0 x4y4z4w4
    __m256 xyzw15 = _mm256_unpackhi_ps(xz, yw);   // x1y1z1w1 x5y5z5w5

    xz = _mm256_unpackhi_ps(vSrc0, vSrc2);        // x2z2x3z3 x6z6x7z7
    yw = _mm256_unpackhi_ps(vSrc1, vSrc3);        // y2w2y3w3 y6w6y7w7
    __m256 xyzw26 = _mm256_unpacklo_ps(xz, yw);   // x2y2z2w2 x6y6z6w6
    __m256 xyzw37 = _mm256_unpackhi_ps(xz, yw);   // x3y3z3w3 x7y7z7w7

    // The low 128-bit lanes hold vertices 0-3, the high lanes vertices 4-7.
    vDst[0] = _mm256_castps256_ps128(xyzw04);
    vDst[1] = _mm256_castps256_ps128(xyzw15);
    vDst[2] = _mm256_castps256_ps128(xyzw26);
    vDst[3] = _mm256_castps256_ps128(xyzw37);

    vDst[4] = _mm256_extractf128_ps(xyzw04, 1);
    vDst[5] = _mm256_extractf128_ps(xyzw15, 1);
    vDst[6] = _mm256_extractf128_ps(xyzw26, 1);
    vDst[7] = _mm256_extractf128_ps(xyzw37, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Full 8x8 transpose of eight 8-wide float rows.
/// @param vDst   - receives the eight transposed rows
/// @param vMask0..vMask7 - the eight input rows
void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
{
    // Stage 1: interleave adjacent row pairs at 32-bit granularity.
    __m256 i01lo = _mm256_unpacklo_ps(vMask0, vMask1);
    __m256 i01hi = _mm256_unpackhi_ps(vMask0, vMask1);
    __m256 i23lo = _mm256_unpacklo_ps(vMask2, vMask3);
    __m256 i23hi = _mm256_unpackhi_ps(vMask2, vMask3);
    __m256 i45lo = _mm256_unpacklo_ps(vMask4, vMask5);
    __m256 i45hi = _mm256_unpackhi_ps(vMask4, vMask5);
    __m256 i67lo = _mm256_unpacklo_ps(vMask6, vMask7);
    __m256 i67hi = _mm256_unpackhi_ps(vMask6, vMask7);
    // Stage 2: gather 64-bit pairs across each four-row group.
    __m256 q0 = _mm256_shuffle_ps(i01lo, i23lo, _MM_SHUFFLE(1,0,1,0));
    __m256 q1 = _mm256_shuffle_ps(i01lo, i23lo, _MM_SHUFFLE(3,2,3,2));
    __m256 q2 = _mm256_shuffle_ps(i01hi, i23hi, _MM_SHUFFLE(1,0,1,0));
    __m256 q3 = _mm256_shuffle_ps(i01hi, i23hi, _MM_SHUFFLE(3,2,3,2));
    __m256 q4 = _mm256_shuffle_ps(i45lo, i67lo, _MM_SHUFFLE(1,0,1,0));
    __m256 q5 = _mm256_shuffle_ps(i45lo, i67lo, _MM_SHUFFLE(3,2,3,2));
    __m256 q6 = _mm256_shuffle_ps(i45hi, i67hi, _MM_SHUFFLE(1,0,1,0));
    __m256 q7 = _mm256_shuffle_ps(i45hi, i67hi, _MM_SHUFFLE(3,2,3,2));
    // Stage 3: exchange 128-bit lanes between the two groups to finish.
    vDst[0] = _mm256_permute2f128_ps(q0, q4, 0x20);
    vDst[1] = _mm256_permute2f128_ps(q1, q5, 0x20);
    vDst[2] = _mm256_permute2f128_ps(q2, q6, 0x20);
    vDst[3] = _mm256_permute2f128_ps(q3, q7, 0x20);
    vDst[4] = _mm256_permute2f128_ps(q0, q4, 0x31);
    vDst[5] = _mm256_permute2f128_ps(q1, q5, 0x31);
    vDst[6] = _mm256_permute2f128_ps(q2, q6, 0x31);
    vDst[7] = _mm256_permute2f128_ps(q3, q7, 0x31);
}
227 void vTranspose8x8(__m256 (&vDst
)[8], const __m256i
&vMask0
, const __m256i
&vMask1
, const __m256i
&vMask2
, const __m256i
&vMask3
, const __m256i
&vMask4
, const __m256i
&vMask5
, const __m256i
&vMask6
, const __m256i
&vMask7
)
229 vTranspose8x8(vDst
, _mm256_castsi256_ps(vMask0
), _mm256_castsi256_ps(vMask1
), _mm256_castsi256_ps(vMask2
), _mm256_castsi256_ps(vMask3
),
230 _mm256_castsi256_ps(vMask4
), _mm256_castsi256_ps(vMask5
), _mm256_castsi256_ps(vMask6
), _mm256_castsi256_ps(vMask7
));
234 //////////////////////////////////////////////////////////////////////////
235 /// TranposeSingleComponent
236 //////////////////////////////////////////////////////////////////////////
237 template<uint32_t bpp
>
238 struct TransposeSingleComponent
240 //////////////////////////////////////////////////////////////////////////
241 /// @brief Pass-thru for single component.
242 /// @param pSrc - source data in SOA form
243 /// @param pDst - output data in AOS form
244 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
246 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
248 #if ENABLE_AVX512_SIMD16
250 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
252 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD16_WIDTH
) / 8);
257 //////////////////////////////////////////////////////////////////////////
259 //////////////////////////////////////////////////////////////////////////
260 struct Transpose8_8_8_8
262 //////////////////////////////////////////////////////////////////////////
263 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
264 /// @param pSrc - source data in SOA form
265 /// @param pDst - output data in AOS form
266 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
268 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
270 #if KNOB_SIMD_WIDTH == 8
271 #if KNOB_ARCH == KNOB_ARCH_AVX
272 __m128i c0c1
= _mm256_castsi256_si128(src
); // rrrrrrrrgggggggg
273 __m128i c2c3
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src
), 1)); // bbbbbbbbaaaaaaaa
274 __m128i c0c2
= _mm_unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
275 __m128i c1c3
= _mm_unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
276 __m128i c01
= _mm_unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
277 __m128i c23
= _mm_unpackhi_epi8(c0c2
, c1c3
); // babababababababa
278 __m128i c0123lo
= _mm_unpacklo_epi16(c01
, c23
); // rgbargbargbargba
279 __m128i c0123hi
= _mm_unpackhi_epi16(c01
, c23
); // rgbargbargbargba
280 _mm_store_si128((__m128i
*)pDst
, c0123lo
);
281 _mm_store_si128((__m128i
*)(pDst
+ 16), c0123hi
);
282 #elif KNOB_ARCH == KNOB_ARCH_AVX2
283 simdscalari dst01
= _mm256_shuffle_epi8(src
,
284 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
285 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
286 dst23
= _mm256_shuffle_epi8(dst23
,
287 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
288 simdscalari dst
= _mm256_or_si256(dst01
, dst23
);
289 _simd_store_si((simdscalari
*)pDst
, dst
);
291 #elif KNOB_SIMD_WIDTH == 16
292 simdscalari mask0
= _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
294 simdscalari dst01
= _simd_shuffle_epi8(src
, mask0
);
296 simdscalari perm1
= _simd_permute_128(src
, src
, 1);
298 simdscalari mask1
= _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
300 simdscalari dst23
= _simd_shuffle_epi8(perm1
, mask1
);
302 simdscalari dst
= _simd_or_si(dst01
, dst23
);
304 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), dst
);
306 #error Unsupported vector width
309 #if ENABLE_AVX512_SIMD16
311 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
313 simd16scalari src
= _simd16_load_si(reinterpret_cast<const simd16scalari
*>(pSrc
));
315 simd16scalari mask0
= _simd16_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
317 simd16scalari dst01
= _simd16_shuffle_epi8(src
, mask0
);
319 simd16scalari perm1
= _simd16_permute2f128_si(src
, src
, 1);
321 simd16scalari mask1
= _simd16_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
323 simd16scalari dst23
= _simd16_shuffle_epi8(perm1
, mask1
);
325 simd16scalari dst
= _simd16_or_si(dst01
, dst23
);
327 _simd16_store_si(reinterpret_cast<simd16scalari
*>(pDst
), dst
);
332 //////////////////////////////////////////////////////////////////////////
334 //////////////////////////////////////////////////////////////////////////
335 struct Transpose8_8_8
337 //////////////////////////////////////////////////////////////////////////
338 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
339 /// @param pSrc - source data in SOA form
340 /// @param pDst - output data in AOS form
341 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
342 #if ENABLE_AVX512_SIMD16
344 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
348 //////////////////////////////////////////////////////////////////////////
350 //////////////////////////////////////////////////////////////////////////
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
355 /// @param pSrc - source data in SOA form
356 /// @param pDst - output data in AOS form
357 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
359 #if KNOB_SIMD_WIDTH == 8
360 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
362 __m128i rg
= _mm256_castsi256_si128(src
); // rrrrrrrr gggggggg
363 __m128i g
= _mm_unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
364 rg
= _mm_unpacklo_epi8(rg
, g
);
365 _mm_store_si128((__m128i
*)pDst
, rg
);
366 #elif KNOB_SIMD_WIDTH == 16
367 __m256i src
= _mm256_load_si256(reinterpret_cast<const __m256i
*>(pSrc
)); // rrrrrrrrrrrrrrrrgggggggggggggggg
369 __m256i r
= _mm256_permute4x64_epi64(src
, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
371 __m256i g
= _mm256_permute4x64_epi64(src
, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx
373 __m256i dst
= _mm256_unpacklo_epi8(r
, g
); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
375 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
), dst
);
377 #error Unsupported vector width
380 #if ENABLE_AVX512_SIMD16
382 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
384 __m256i src
= _mm256_load_si256(reinterpret_cast<const __m256i
*>(pSrc
)); // rrrrrrrrrrrrrrrrgggggggggggggggg
386 __m256i r
= _mm256_permute4x64_epi64(src
, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
388 __m256i g
= _mm256_permute4x64_epi64(src
, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx
390 __m256i dst
= _mm256_unpacklo_epi8(r
, g
); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
392 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
), dst
);
397 //////////////////////////////////////////////////////////////////////////
398 /// Transpose32_32_32_32
399 //////////////////////////////////////////////////////////////////////////
400 struct Transpose32_32_32_32
402 //////////////////////////////////////////////////////////////////////////
403 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
404 /// @param pSrc - source data in SOA form
405 /// @param pDst - output data in AOS form
406 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
408 #if KNOB_SIMD_WIDTH == 8
409 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
410 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
411 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
412 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
415 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
416 _mm_store_ps((float*)pDst
, vDst
[0]);
417 _mm_store_ps((float*)pDst
+4, vDst
[1]);
418 _mm_store_ps((float*)pDst
+8, vDst
[2]);
419 _mm_store_ps((float*)pDst
+12, vDst
[3]);
420 _mm_store_ps((float*)pDst
+16, vDst
[4]);
421 _mm_store_ps((float*)pDst
+20, vDst
[5]);
422 _mm_store_ps((float*)pDst
+24, vDst
[6]);
423 _mm_store_ps((float*)pDst
+28, vDst
[7]);
424 #elif KNOB_SIMD_WIDTH == 16
425 #if ENABLE_AVX512_EMULATION
426 simdscalar src0
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
));
427 simdscalar src1
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 16);
428 simdscalar src2
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 32);
429 simdscalar src3
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 48);
433 vTranspose4x8(vDst
, src0
.lo
, src1
.lo
, src2
.lo
, src3
.lo
);
435 _mm_store_ps(reinterpret_cast<float*>(pDst
), vDst
[0]);
436 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 4, vDst
[1]);
437 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 8, vDst
[2]);
438 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 12, vDst
[3]);
439 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 16, vDst
[4]);
440 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 20, vDst
[5]);
441 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 24, vDst
[6]);
442 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 28, vDst
[7]);
444 vTranspose4x8(vDst
, src0
.hi
, src1
.hi
, src2
.hi
, src3
.hi
);
446 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 32, vDst
[0]);
447 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 36, vDst
[1]);
448 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 40, vDst
[2]);
449 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 44, vDst
[3]);
450 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 48, vDst
[4]);
451 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 52, vDst
[5]);
452 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 56, vDst
[6]);
453 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 60, vDst
[7]);
456 #error Unsupported vector width
459 #if ENABLE_AVX512_SIMD16
461 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
463 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
464 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16);
465 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 32);
466 simd16scalar src3
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 48);
470 vTranspose4x8(vDst
, _simd16_extract_ps(src0
, 0), _simd16_extract_ps(src1
, 0), _simd16_extract_ps(src2
, 0), _simd16_extract_ps(src3
, 0));
473 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, reinterpret_cast<simd16scalar
*>(vDst
)[0]);
474 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, reinterpret_cast<simd16scalar
*>(vDst
)[1]);
476 _mm_store_ps(reinterpret_cast<float *>(pDst
), vDst
[0]);
477 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 4, vDst
[1]);
478 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 8, vDst
[2]);
479 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 12, vDst
[3]);
480 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 16, vDst
[4]);
481 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 20, vDst
[5]);
482 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 24, vDst
[6]);
483 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 28, vDst
[7]);
486 vTranspose4x8(vDst
, _simd16_extract_ps(src0
, 1), _simd16_extract_ps(src1
, 1), _simd16_extract_ps(src2
, 1), _simd16_extract_ps(src3
, 1));
489 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 32, reinterpret_cast<simd16scalar
*>(vDst
)[2]);
490 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 48, reinterpret_cast<simd16scalar
*>(vDst
)[3]);
492 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 32, vDst
[0]);
493 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 36, vDst
[1]);
494 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 40, vDst
[2]);
495 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 44, vDst
[3]);
496 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 48, vDst
[4]);
497 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 52, vDst
[5]);
498 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 56, vDst
[6]);
499 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 60, vDst
[7]);
505 //////////////////////////////////////////////////////////////////////////
506 /// Transpose32_32_32
507 //////////////////////////////////////////////////////////////////////////
508 struct Transpose32_32_32
510 //////////////////////////////////////////////////////////////////////////
511 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
512 /// @param pSrc - source data in SOA form
513 /// @param pDst - output data in AOS form
514 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
516 #if KNOB_SIMD_WIDTH == 8
517 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
518 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
519 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
522 vTranspose3x8(vDst
, src0
, src1
, src2
);
523 _mm_store_ps((float*)pDst
, vDst
[0]);
524 _mm_store_ps((float*)pDst
+ 4, vDst
[1]);
525 _mm_store_ps((float*)pDst
+ 8, vDst
[2]);
526 _mm_store_ps((float*)pDst
+ 12, vDst
[3]);
527 _mm_store_ps((float*)pDst
+ 16, vDst
[4]);
528 _mm_store_ps((float*)pDst
+ 20, vDst
[5]);
529 _mm_store_ps((float*)pDst
+ 24, vDst
[6]);
530 _mm_store_ps((float*)pDst
+ 28, vDst
[7]);
531 #elif KNOB_SIMD_WIDTH == 16
532 #if ENABLE_AVX512_EMULATION
533 simdscalar src0
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
));
534 simdscalar src1
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 16);
535 simdscalar src2
= _simd_load_ps(reinterpret_cast<const float*>(pSrc
) + 32);
539 vTranspose3x8(vDst
, src0
.lo
, src1
.lo
, src2
.lo
);
541 _mm_store_ps(reinterpret_cast<float*>(pDst
), vDst
[0]);
542 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 4, vDst
[1]);
543 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 8, vDst
[2]);
544 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 12, vDst
[3]);
545 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 16, vDst
[4]);
546 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 20, vDst
[5]);
547 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 24, vDst
[6]);
548 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 28, vDst
[7]);
550 vTranspose3x8(vDst
, src0
.hi
, src1
.hi
, src2
.hi
);
552 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 32, vDst
[0]);
553 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 36, vDst
[1]);
554 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 40, vDst
[2]);
555 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 44, vDst
[3]);
556 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 48, vDst
[4]);
557 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 52, vDst
[5]);
558 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 56, vDst
[6]);
559 _mm_store_ps(reinterpret_cast<float*>(pDst
) + 60, vDst
[7]);
562 #error Unsupported vector width
565 #if ENABLE_AVX512_SIMD16
567 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
569 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
570 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16);
571 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 32);
575 vTranspose3x8(vDst
, _simd16_extract_ps(src0
, 0), _simd16_extract_ps(src1
, 0), _simd16_extract_ps(src2
, 0));
578 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, reinterpret_cast<simd16scalar
*>(vDst
)[0]);
579 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, reinterpret_cast<simd16scalar
*>(vDst
)[1]);
581 _mm_store_ps(reinterpret_cast<float *>(pDst
), vDst
[0]);
582 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 4, vDst
[1]);
583 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 8, vDst
[2]);
584 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 12, vDst
[3]);
585 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 16, vDst
[4]);
586 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 20, vDst
[5]);
587 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 24, vDst
[6]);
588 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 28, vDst
[7]);
591 vTranspose3x8(vDst
, _simd16_extract_ps(src0
, 1), _simd16_extract_ps(src1
, 1), _simd16_extract_ps(src2
, 1));
594 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 32, reinterpret_cast<simd16scalar
*>(vDst
)[2]);
595 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 48, reinterpret_cast<simd16scalar
*>(vDst
)[3]);
597 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 32, vDst
[0]);
598 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 36, vDst
[1]);
599 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 40, vDst
[2]);
600 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 44, vDst
[3]);
601 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 48, vDst
[4]);
602 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 52, vDst
[5]);
603 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 56, vDst
[6]);
604 _mm_store_ps(reinterpret_cast<float *>(pDst
) + 60, vDst
[7]);
610 //////////////////////////////////////////////////////////////////////////
612 //////////////////////////////////////////////////////////////////////////
613 struct Transpose32_32
615 //////////////////////////////////////////////////////////////////////////
616 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
617 /// @param pSrc - source data in SOA form
618 /// @param pDst - output data in AOS form
619 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
621 #if KNOB_SIMD_WIDTH == 8
622 const float* pfSrc
= (const float*)pSrc
;
623 __m128 src_r0
= _mm_load_ps(pfSrc
+ 0);
624 __m128 src_r1
= _mm_load_ps(pfSrc
+ 4);
625 __m128 src_g0
= _mm_load_ps(pfSrc
+ 8);
626 __m128 src_g1
= _mm_load_ps(pfSrc
+ 12);
628 __m128 dst0
= _mm_unpacklo_ps(src_r0
, src_g0
);
629 __m128 dst1
= _mm_unpackhi_ps(src_r0
, src_g0
);
630 __m128 dst2
= _mm_unpacklo_ps(src_r1
, src_g1
);
631 __m128 dst3
= _mm_unpackhi_ps(src_r1
, src_g1
);
633 float* pfDst
= (float*)pDst
;
634 _mm_store_ps(pfDst
+ 0, dst0
);
635 _mm_store_ps(pfDst
+ 4, dst1
);
636 _mm_store_ps(pfDst
+ 8, dst2
);
637 _mm_store_ps(pfDst
+ 12, dst3
);
638 #elif KNOB_SIMD_WIDTH == 16
639 const float* pfSrc
= (const float*)pSrc
;
640 __m256 src_r0
= _mm256_load_ps(pfSrc
+ 0);
641 __m256 src_r1
= _mm256_load_ps(pfSrc
+ 8);
642 __m256 src_g0
= _mm256_load_ps(pfSrc
+ 16);
643 __m256 src_g1
= _mm256_load_ps(pfSrc
+ 24);
645 __m256 dst0
= _mm256_unpacklo_ps(src_r0
, src_g0
);
646 __m256 dst1
= _mm256_unpackhi_ps(src_r0
, src_g0
);
647 __m256 dst2
= _mm256_unpacklo_ps(src_r1
, src_g1
);
648 __m256 dst3
= _mm256_unpackhi_ps(src_r1
, src_g1
);
650 float* pfDst
= (float*)pDst
;
651 _mm256_store_ps(pfDst
+ 0, dst0
);
652 _mm256_store_ps(pfDst
+ 8, dst1
);
653 _mm256_store_ps(pfDst
+ 16, dst2
);
654 _mm256_store_ps(pfDst
+ 24, dst3
);
656 #error Unsupported vector width
659 #if ENABLE_AVX512_SIMD16
661 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
663 const float *pfSrc
= reinterpret_cast<const float *>(pSrc
);
665 __m256 src_r0
= _mm256_load_ps(pfSrc
+ 0);
666 __m256 src_r1
= _mm256_load_ps(pfSrc
+ 8);
667 __m256 src_g0
= _mm256_load_ps(pfSrc
+ 16);
668 __m256 src_g1
= _mm256_load_ps(pfSrc
+ 24);
670 __m256 dst0
= _mm256_unpacklo_ps(src_r0
, src_g0
);
671 __m256 dst1
= _mm256_unpackhi_ps(src_r0
, src_g0
);
672 __m256 dst2
= _mm256_unpacklo_ps(src_r1
, src_g1
);
673 __m256 dst3
= _mm256_unpackhi_ps(src_r1
, src_g1
);
675 float *pfDst
= reinterpret_cast<float *>(pDst
);
677 _mm256_store_ps(pfDst
+ 0, dst0
);
678 _mm256_store_ps(pfDst
+ 8, dst1
);
679 _mm256_store_ps(pfDst
+ 16, dst2
);
680 _mm256_store_ps(pfDst
+ 24, dst3
);
685 //////////////////////////////////////////////////////////////////////////
686 /// Transpose16_16_16_16
687 //////////////////////////////////////////////////////////////////////////
688 struct Transpose16_16_16_16
690 //////////////////////////////////////////////////////////////////////////
691 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
692 /// @param pSrc - source data in SOA form
693 /// @param pDst - output data in AOS form
694 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
696 #if KNOB_SIMD_WIDTH == 8
697 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
698 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
700 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
701 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
702 __m128i src_b
= _mm256_extractf128_si256(src_ba
, 0);
703 __m128i src_a
= _mm256_extractf128_si256(src_ba
, 1);
705 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
706 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
707 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
708 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
710 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
711 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
712 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
713 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
715 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
716 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
717 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
718 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
719 #elif KNOB_SIMD_WIDTH == 16
720 #if ENABLE_AVX512_EMULATION
721 simdscalari src_rg
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
));
722 simdscalari src_ba
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
+ sizeof(simdscalari
)));
724 __m256i src_r
= src_rg
.lo
;
725 __m256i src_g
= src_rg
.hi
;
726 __m256i src_b
= src_ba
.lo
;
727 __m256i src_a
= src_ba
.hi
;
729 __m256i rg0
= _mm256_unpacklo_epi16(src_r
, src_g
);
730 __m256i rg1
= _mm256_unpackhi_epi16(src_r
, src_g
);
731 __m256i ba0
= _mm256_unpacklo_epi16(src_b
, src_a
);
732 __m256i ba1
= _mm256_unpackhi_epi16(src_b
, src_a
);
734 __m256i dst0
= _mm256_unpacklo_epi32(rg0
, ba0
);
735 __m256i dst1
= _mm256_unpackhi_epi32(rg0
, ba0
);
736 __m256i dst2
= _mm256_unpacklo_epi32(rg1
, ba1
);
737 __m256i dst3
= _mm256_unpackhi_epi32(rg1
, ba1
);
739 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 0, dst0
);
740 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 1, dst1
);
741 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 2, dst2
);
742 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 3, dst3
);
745 #error Unsupported vector width
748 #if ENABLE_AVX512_SIMD16
750 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
752 simd16scalari src_rg
= _simd16_load_si(reinterpret_cast<const simd16scalari
*>(pSrc
));
753 simd16scalari src_ba
= _simd16_load_si(reinterpret_cast<const simd16scalari
*>(pSrc
+ sizeof(simd16scalari
)));
755 __m256i src_r
= _simd16_extract_si(src_rg
, 0);
756 __m256i src_g
= _simd16_extract_si(src_rg
, 1);
757 __m256i src_b
= _simd16_extract_si(src_ba
, 0);
758 __m256i src_a
= _simd16_extract_si(src_ba
, 1);
760 __m256i rg0
= _mm256_unpacklo_epi16(src_r
, src_g
);
761 __m256i rg1
= _mm256_unpackhi_epi16(src_r
, src_g
);
762 __m256i ba0
= _mm256_unpacklo_epi16(src_b
, src_a
);
763 __m256i ba1
= _mm256_unpackhi_epi16(src_b
, src_a
);
765 __m256i dst0
= _mm256_unpacklo_epi32(rg0
, ba0
);
766 __m256i dst1
= _mm256_unpackhi_epi32(rg0
, ba0
);
767 __m256i dst2
= _mm256_unpacklo_epi32(rg1
, ba1
);
768 __m256i dst3
= _mm256_unpackhi_epi32(rg1
, ba1
);
770 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 0, dst0
);
771 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 1, dst1
);
772 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 2, dst2
);
773 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 3, dst3
);
778 //////////////////////////////////////////////////////////////////////////
779 /// Transpose16_16_16
780 //////////////////////////////////////////////////////////////////////////
781 struct Transpose16_16_16
783 //////////////////////////////////////////////////////////////////////////
784 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
785 /// @param pSrc - source data in SOA form
786 /// @param pDst - output data in AOS form
787 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
789 #if KNOB_SIMD_WIDTH == 8
790 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
792 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
793 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
794 __m128i src_b
= _mm_load_si128((const __m128i
*)(pSrc
+ sizeof(simdscalari
)));
795 __m128i src_a
= _mm_undefined_si128();
797 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
798 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
799 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
800 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
802 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
803 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
804 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
805 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
807 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
808 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
809 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
810 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
811 #elif KNOB_SIMD_WIDTH == 16
812 #if ENABLE_AVX512_EMULATION
813 simdscalari src_rg
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
));
815 __m256i src_r
= src_rg
.lo
;
816 __m256i src_g
= src_rg
.hi
;
817 __m256i src_b
= _mm256_load_si256(reinterpret_cast<const __m256i
*>(pSrc
+ sizeof(simdscalari
)));
818 __m256i src_a
= _mm256_undefined_si256();
820 __m256i rg0
= _mm256_unpacklo_epi16(src_r
, src_g
);
821 __m256i rg1
= _mm256_unpackhi_epi16(src_r
, src_g
);
822 __m256i ba0
= _mm256_unpacklo_epi16(src_b
, src_a
);
823 __m256i ba1
= _mm256_unpackhi_epi16(src_b
, src_a
);
825 __m256i dst0
= _mm256_unpacklo_epi32(rg0
, ba0
);
826 __m256i dst1
= _mm256_unpackhi_epi32(rg0
, ba0
);
827 __m256i dst2
= _mm256_unpacklo_epi32(rg1
, ba1
);
828 __m256i dst3
= _mm256_unpackhi_epi32(rg1
, ba1
);
830 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 0, dst0
);
831 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 1, dst1
);
832 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 2, dst2
);
833 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 3, dst3
);
836 #error Unsupported vector width
839 #if ENABLE_AVX512_SIMD16
841 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
843 simd16scalari src_rg
= _simd16_load_si(reinterpret_cast<const simd16scalari
*>(pSrc
));
845 __m256i src_r
= _simd16_extract_si(src_rg
, 0);
846 __m256i src_g
= _simd16_extract_si(src_rg
, 1);
847 __m256i src_b
= _mm256_load_si256(reinterpret_cast<const __m256i
*>(pSrc
+ sizeof(simd16scalari
)));
848 __m256i src_a
= _mm256_undefined_si256();
850 __m256i rg0
= _mm256_unpacklo_epi16(src_r
, src_g
);
851 __m256i rg1
= _mm256_unpackhi_epi16(src_r
, src_g
);
852 __m256i ba0
= _mm256_unpacklo_epi16(src_b
, src_a
);
853 __m256i ba1
= _mm256_unpackhi_epi16(src_b
, src_a
);
855 __m256i dst0
= _mm256_unpacklo_epi32(rg0
, ba0
);
856 __m256i dst1
= _mm256_unpackhi_epi32(rg0
, ba0
);
857 __m256i dst2
= _mm256_unpacklo_epi32(rg1
, ba1
);
858 __m256i dst3
= _mm256_unpackhi_epi32(rg1
, ba1
);
860 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 0, dst0
);
861 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 1, dst1
);
862 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 2, dst2
);
863 _mm256_store_si256(reinterpret_cast<__m256i
*>(pDst
) + 3, dst3
);
868 //////////////////////////////////////////////////////////////////////////
870 //////////////////////////////////////////////////////////////////////////
871 struct Transpose16_16
873 //////////////////////////////////////////////////////////////////////////
874 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
875 /// @param pSrc - source data in SOA form
876 /// @param pDst - output data in AOS form
877 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
879 #if KNOB_SIMD_WIDTH == 8
880 simdscalar src
= _simd_load_ps((const float*)pSrc
);
882 __m128 comp0
= _mm256_castps256_ps128(src
);
883 __m128 comp1
= _mm256_extractf128_ps(src
, 1);
885 __m128i comp0i
= _mm_castps_si128(comp0
);
886 __m128i comp1i
= _mm_castps_si128(comp1
);
888 __m128i resLo
= _mm_unpacklo_epi16(comp0i
, comp1i
);
889 __m128i resHi
= _mm_unpackhi_epi16(comp0i
, comp1i
);
891 _mm_store_si128((__m128i
*)pDst
, resLo
);
892 _mm_store_si128((__m128i
*)pDst
+ 1, resHi
);
893 #elif KNOB_SIMD_WIDTH == 16
894 #if ENABLE_AVX512_EMULATION
895 simdscalari src
= _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc
)));
899 result
.lo
= _mm256_unpacklo_epi16(src
.lo
, src
.hi
);
900 result
.hi
= _mm256_unpackhi_epi16(src
.lo
, src
.hi
);
902 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), result
);
905 #error Unsupported vector width
908 #if ENABLE_AVX512_SIMD16
910 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
912 simd16scalari result
= _simd16_setzero_si();
914 simd16scalari src
= _simd16_castps_si(_simd16_load_ps(reinterpret_cast<const float *>(pSrc
)));
916 simdscalari srclo
= _simd16_extract_si(src
, 0);
917 simdscalari srchi
= _simd16_extract_si(src
, 1);
919 result
= _simd16_insert_si(result
, _mm256_unpacklo_epi16(srclo
, srchi
), 0);
920 result
= _simd16_insert_si(result
, _mm256_unpackhi_epi16(srclo
, srchi
), 1);
922 _simd16_store_si(reinterpret_cast<simd16scalari
*>(pDst
), result
);
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    ///        (Comment previously said 5_5_5_1 — copy-paste from the struct
    ///        above; this struct is the 1_5_5_5 variant.)
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    ///        No implementation exists; any use is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
1067 // helper function to unroll loops
1068 template<int Begin
, int End
, int Step
= 1>
1070 template<typename Lambda
>
1071 INLINE
static void step(Lambda
& func
) {
1073 UnrollerL
<Begin
+ Step
, End
, Step
>::step(func
);
1077 template<int End
, int Step
>
1078 struct UnrollerL
<End
, End
, Step
> {
1079 template<typename Lambda
>
1080 static void step(Lambda
& func
) {
1084 // helper function to unroll loops, with mask to skip specific iterations
1085 template<int Begin
, int End
, int Step
= 1, int Mask
= 0x7f>
1086 struct UnrollerLMask
{
1087 template<typename Lambda
>
1088 INLINE
static void step(Lambda
& func
) {
1089 if(Mask
& (1 << Begin
))
1093 UnrollerL
<Begin
+ Step
, End
, Step
>::step(func
);
1097 template<int End
, int Step
, int Mask
>
1098 struct UnrollerLMask
<End
, End
, Step
, Mask
> {
1099 template<typename Lambda
>
1100 static void step(Lambda
& func
) {
1104 // general CRC compute
1106 uint32_t ComputeCRC(uint32_t crc
, const void *pData
, uint32_t size
)
1108 #if defined(_WIN64) || defined(__x86_64__)
1109 uint32_t sizeInQwords
= size
/ sizeof(uint64_t);
1110 uint32_t sizeRemainderBytes
= size
% sizeof(uint64_t);
1111 uint64_t* pDataWords
= (uint64_t*)pData
;
1112 for (uint32_t i
= 0; i
< sizeInQwords
; ++i
)
1114 crc
= (uint32_t)_mm_crc32_u64(crc
, *pDataWords
++);
1117 uint32_t sizeInDwords
= size
/ sizeof(uint32_t);
1118 uint32_t sizeRemainderBytes
= size
% sizeof(uint32_t);
1119 uint32_t* pDataWords
= (uint32_t*)pData
;
1120 for (uint32_t i
= 0; i
< sizeInDwords
; ++i
)
1122 crc
= _mm_crc32_u32(crc
, *pDataWords
++);
1126 uint8_t* pRemainderBytes
= (uint8_t*)pDataWords
;
1127 for (uint32_t i
= 0; i
< sizeRemainderBytes
; ++i
)
1129 crc
= _mm_crc32_u8(crc
, *pRemainderBytes
++);
//////////////////////////////////////////////////////////////////////////
/// Add byte offset to any-type pointer
//////////////////////////////////////////////////////////////////////////
template <typename T>
static T* PtrAdd(T* p, intptr_t offset)
{
    // Round-trip through an integer so the offset is applied in bytes,
    // independent of sizeof(T).
    const intptr_t addr = reinterpret_cast<intptr_t>(p);
    return reinterpret_cast<T*>(addr + offset);
}
//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
//////////////////////////////////////////////////////////////////////////
template <typename T>
static bool IsPow2(T value)
{
    // A power of two equals its own lowest set bit.
    // Note: 0 also satisfies this test.
    const T lowestSetBit = value & (0 - value);
    return value == lowestSetBit;
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));
    // Clearing the low bits rounds down to the alignment boundary.
    const T1 mask = T1(alignment - 1);
    return value & ~mask;
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    // Bump past the boundary, then snap back down.
    const T1 bumped = value + T1(alignment - 1);
    return AlignDownPow2(bumped, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    // Do the arithmetic on an integer representation of the pointer:
    // bump past the boundary, snap down, cast back.
    const uintptr_t bumped = reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1);
    return reinterpret_cast<T1*>(AlignDownPow2(bumped, alignment));
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDown(T1 value, T2 alignment)
{
    // Fast path for power-of-two alignments; general modulo otherwise.
    if (IsPow2(alignment))
    {
        return AlignDownPow2(value, alignment);
    }
    return value - T1(value % alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignDown(T1* value, T2 alignment)
{
    // Delegate to the integer overload; named cast replaces the previous
    // C-style (T1*) cast for consistency with the pointer AlignUpPow2.
    return reinterpret_cast<T1*>(AlignDown(uintptr_t(value), alignment));
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUp(T1 value, T2 alignment)
{
    // Bump past the boundary, then round down.
    const T1 bumped = value + T1(alignment - 1);
    return AlignDown(bumped, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUp(T1* value, T2 alignment)
{
    // Advance the pointer past the boundary in bytes, then snap back down.
    return AlignDown(PtrAdd(value, alignment - 1), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
struct BitsArray
{
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    // BUGFIX (message only): "must an" -> "must be an".
    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
                  "Element size must be an integral fraction of pointer size");

    // Elements are packed BitsPerElementT bits apart within each word;
    // zero-initialized by default.
    size_t m_words[NUM_WORDS] = {};

    /// @brief Extracts element elementIndex from the packed storage.
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
// Ranged integer argument for TemplateArgUnroller
// (the [TMin, TMax] range is carried in the type; only the runtime value
// is stored)
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};
1269 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1270 // arguments to static template arguments.
1271 template <typename TermT
, typename
... ArgsB
>
1272 struct TemplateArgUnroller
1274 //-----------------------------------------
1276 //-----------------------------------------
1278 // Last Arg Terminator
1279 static typename
TermT::FuncType
GetFunc(bool bArg
)
1283 return TermT::template GetFunc
<ArgsB
..., std::true_type
>();
1286 return TermT::template GetFunc
<ArgsB
..., std::false_type
>();
1289 // Recursively parse args
1290 template <typename
... TArgsT
>
1291 static typename
TermT::FuncType
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
1295 return TemplateArgUnroller
<TermT
, ArgsB
..., std::true_type
>::GetFunc(remainingArgs
...);
1298 return TemplateArgUnroller
<TermT
, ArgsB
..., std::false_type
>::GetFunc(remainingArgs
...);
1301 //-----------------------------------------
1302 // Integer value (within specified range)
1303 //-----------------------------------------
1305 // Last Arg Terminator
1306 template <uint32_t TMin
, uint32_t TMax
>
1307 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
)
1309 if (iArg
.val
== TMax
)
1311 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TMax
>>();
1315 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
-1>{iArg
.val
});
1317 SWR_ASSUME(false); return nullptr;
1319 template <uint32_t TVal
>
1320 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
)
1322 SWR_ASSERT(iArg
.val
== TVal
);
1323 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TVal
>>();
1326 // Recursively parse args
1327 template <uint32_t TMin
, uint32_t TMax
, typename
... TArgsT
>
1328 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
, TArgsT
... remainingArgs
)
1330 if (iArg
.val
== TMax
)
1332 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TMax
>>::GetFunc(remainingArgs
...);
1336 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
- 1>{iArg
.val
}, remainingArgs
...);
1338 SWR_ASSUME(false); return nullptr;
1340 template <uint32_t TVal
, typename
... TArgsT
>
1341 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
, TArgsT
... remainingArgs
)
1343 SWR_ASSERT(iArg
.val
== TVal
);
1344 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TVal
>>::GetFunc(remainingArgs
...);