9dfa16a529e432594e2cf27137ef46270b402857
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @brief Utilities used by SWR core.
27 ******************************************************************************/
31 #include <type_traits>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
#if defined(_WIN64) || defined(__x86_64__)
// Native 64-bit targets provide the 64-bit lane insert/extract intrinsics.
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
//////////////////////////////////////////////////////////////////////////
/// @brief Emulates _mm_extract_epi64 for 32-bit targets, where the 64-bit
///        element intrinsics are unavailable.
/// @param a - source vector
/// @param ndx - 64-bit lane to extract (0 = low, nonzero = high)
/// @return the selected 64-bit lane of a
INLINE
int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    // Spill the vector to aligned memory and rebuild the requested lane
    // from its two 32-bit halves.
    OSALIGNLINE(uint32_t) elems[4];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        uint64_t foo = elems[0];
        foo |= (uint64_t)elems[1] << 32;
        return foo;
    }
    else
    {
        uint64_t foo = elems[2];
        foo |= (uint64_t)elems[3] << 32;
        return foo;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Emulates _mm_insert_epi64 for 32-bit targets.
/// @param a - source vector
/// @param b - 64-bit value to insert
/// @param ndx - 64-bit lane to replace (0 = low, nonzero = high)
/// @return a with the selected lane replaced by b
INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    // Spill to aligned memory, patch the selected lane, then reload.
    OSALIGNLINE(int64_t) elems[2];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        elems[0] = b;
    }
    else
    {
        elems[1] = b;
    }
    __m128i out;
    out = _mm_load_si128((const __m128i*)elems);
    return out;
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief In-place 4x4 transpose of four float rows (SSE).
///        On return, row<n> holds element n of each of the original rows.
inline
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    // Work in the integer domain; the casts are free bit reinterpretations.
    __m128i row0i = _mm_castps_si128(row0);
    __m128i row1i = _mm_castps_si128(row1);
    __m128i row2i = _mm_castps_si128(row2);
    __m128i row3i = _mm_castps_si128(row3);

    // Interleave rows 2/3 at 32-bit granularity: lo -> c0 d0 c1 d1, hi -> c2 d2 c3 d3.
    __m128i vTemp = row2i;
    row2i = _mm_unpacklo_epi32(row2i, row3i);
    vTemp = _mm_unpackhi_epi32(vTemp, row3i);

    // Interleave rows 0/1: lo -> a0 b0 a1 b1, hi -> a2 b2 a3 b3.
    row3i = row0i;
    row0i = _mm_unpacklo_epi32(row0i, row1i);
    row3i = _mm_unpackhi_epi32(row3i, row1i);

    // Combine 64-bit halves to form the first two transposed rows.
    row1i = row0i;
    row0i = _mm_unpacklo_epi64(row0i, row2i);
    row1i = _mm_unpackhi_epi64(row1i, row2i);

    // ...and the last two transposed rows.
    row2i = row3i;
    row2i = _mm_unpacklo_epi64(row2i, vTemp);
    row3i = _mm_unpackhi_epi64(row3i, vTemp);

    row0 = _mm_castsi128_ps(row0i);
    row1 = _mm_castsi128_ps(row1i);
    row2 = _mm_castsi128_ps(row2i);
    row3 = _mm_castsi128_ps(row3i);
}
//////////////////////////////////////////////////////////////////////////
/// @brief In-place 4x4 transpose of four 32-bit integer rows (SSE).
///        On return, row<n> holds element n of each of the original rows.
inline
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    // Interleave rows 2/3 at 32-bit granularity: lo -> c0 d0 c1 d1, hi -> c2 d2 c3 d3.
    __m128i vTemp = row2;
    row2 = _mm_unpacklo_epi32(row2, row3);
    vTemp = _mm_unpackhi_epi32(vTemp, row3);

    // Interleave rows 0/1: lo -> a0 b0 a1 b1, hi -> a2 b2 a3 b3.
    row3 = row0;
    row0 = _mm_unpacklo_epi32(row0, row1);
    row3 = _mm_unpackhi_epi32(row3, row1);

    // Combine 64-bit halves to form the first two transposed rows.
    row1 = row0;
    row0 = _mm_unpacklo_epi64(row0, row2);
    row1 = _mm_unpackhi_epi64(row1, row2);

    // ...and the last two transposed rows.
    row2 = row3;
    row2 = _mm_unpacklo_epi64(row2, vTemp);
    row3 = _mm_unpackhi_epi64(row3, vTemp);
}
136 #if KNOB_SIMD_WIDTH == 8
138 void vTranspose3x8(__m128 (&vDst
)[8], const simdscalar
&vSrc0
, const simdscalar
&vSrc1
, const simdscalar
&vSrc2
)
140 simdscalar r0r2
= _simd_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
141 simdscalar r1rx
= _simd_unpacklo_ps(vSrc1
, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
142 simdscalar r02r1xlolo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
143 simdscalar r02r1xlohi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
145 r0r2
= _simd_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
146 r1rx
= _simd_unpackhi_ps(vSrc1
, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77
147 simdscalar r02r1xhilo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
148 simdscalar r02r1xhihi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
150 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
151 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
152 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
153 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
155 vDst
[4] = _simd_extractf128_ps(r02r1xlolo
, 1);
156 vDst
[5] = _simd_extractf128_ps(r02r1xlohi
, 1);
157 vDst
[6] = _simd_extractf128_ps(r02r1xhilo
, 1);
158 vDst
[7] = _simd_extractf128_ps(r02r1xhihi
, 1);
162 void vTranspose4x8(__m128 (&vDst
)[8], const simdscalar
&vSrc0
, const simdscalar
&vSrc1
, const simdscalar
&vSrc2
, const simdscalar
&vSrc3
)
164 simdscalar r0r2
= _simd_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
165 simdscalar r1rx
= _simd_unpacklo_ps(vSrc1
, vSrc3
); //y0w0y1w1 y4w4y5w5
166 simdscalar r02r1xlolo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
167 simdscalar r02r1xlohi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
169 r0r2
= _simd_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
170 r1rx
= _simd_unpackhi_ps(vSrc1
, vSrc3
); //y2w2y3w3 y6w6yw77
171 simdscalar r02r1xhilo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
172 simdscalar r02r1xhihi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
174 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
175 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
176 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
177 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
179 vDst
[4] = _simd_extractf128_ps(r02r1xlolo
, 1);
180 vDst
[5] = _simd_extractf128_ps(r02r1xlohi
, 1);
181 vDst
[6] = _simd_extractf128_ps(r02r1xhilo
, 1);
182 vDst
[7] = _simd_extractf128_ps(r02r1xhihi
, 1);
#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes 4 SOA simd16 registers into 4 AOS simd16 outputs;
///        each destination register holds 4 consecutive xyzw groups.
INLINE
void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
{
    // pre-permute input to setup the right order after all the unpacking
    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    simd16scalar pre0 = _simd16_permute_ps(src0, perm);    // r
    simd16scalar pre1 = _simd16_permute_ps(src1, perm);    // g
    simd16scalar pre2 = _simd16_permute_ps(src2, perm);    // b
    simd16scalar pre3 = _simd16_permute_ps(src3, perm);    // a

    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);

    dst[0] = _simd16_unpacklo_ps(rblo, galo);
    dst[1] = _simd16_unpackhi_ps(rblo, galo);
    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
}

#endif
209 void vTranspose8x8(simdscalar (&vDst
)[8], const simdscalar
&vMask0
, const simdscalar
&vMask1
, const simdscalar
&vMask2
, const simdscalar
&vMask3
, const simdscalar
&vMask4
, const simdscalar
&vMask5
, const simdscalar
&vMask6
, const simdscalar
&vMask7
)
211 simdscalar __t0
= _simd_unpacklo_ps(vMask0
, vMask1
);
212 simdscalar __t1
= _simd_unpackhi_ps(vMask0
, vMask1
);
213 simdscalar __t2
= _simd_unpacklo_ps(vMask2
, vMask3
);
214 simdscalar __t3
= _simd_unpackhi_ps(vMask2
, vMask3
);
215 simdscalar __t4
= _simd_unpacklo_ps(vMask4
, vMask5
);
216 simdscalar __t5
= _simd_unpackhi_ps(vMask4
, vMask5
);
217 simdscalar __t6
= _simd_unpacklo_ps(vMask6
, vMask7
);
218 simdscalar __t7
= _simd_unpackhi_ps(vMask6
, vMask7
);
219 simdscalar __tt0
= _simd_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(1,0,1,0));
220 simdscalar __tt1
= _simd_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(3,2,3,2));
221 simdscalar __tt2
= _simd_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(1,0,1,0));
222 simdscalar __tt3
= _simd_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(3,2,3,2));
223 simdscalar __tt4
= _simd_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(1,0,1,0));
224 simdscalar __tt5
= _simd_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(3,2,3,2));
225 simdscalar __tt6
= _simd_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(1,0,1,0));
226 simdscalar __tt7
= _simd_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(3,2,3,2));
227 vDst
[0] = _simd_permute2f128_ps(__tt0
, __tt4
, 0x20);
228 vDst
[1] = _simd_permute2f128_ps(__tt1
, __tt5
, 0x20);
229 vDst
[2] = _simd_permute2f128_ps(__tt2
, __tt6
, 0x20);
230 vDst
[3] = _simd_permute2f128_ps(__tt3
, __tt7
, 0x20);
231 vDst
[4] = _simd_permute2f128_ps(__tt0
, __tt4
, 0x31);
232 vDst
[5] = _simd_permute2f128_ps(__tt1
, __tt5
, 0x31);
233 vDst
[6] = _simd_permute2f128_ps(__tt2
, __tt6
, 0x31);
234 vDst
[7] = _simd_permute2f128_ps(__tt3
, __tt7
, 0x31);
238 void vTranspose8x8(simdscalar (&vDst
)[8], const simdscalari
&vMask0
, const simdscalari
&vMask1
, const simdscalari
&vMask2
, const simdscalari
&vMask3
, const simdscalari
&vMask4
, const simdscalari
&vMask5
, const simdscalari
&vMask6
, const simdscalari
&vMask7
)
240 vTranspose8x8(vDst
, _simd_castsi_ps(vMask0
), _simd_castsi_ps(vMask1
), _simd_castsi_ps(vMask2
), _simd_castsi_ps(vMask3
),
241 _simd_castsi_ps(vMask4
), _simd_castsi_ps(vMask5
), _simd_castsi_ps(vMask6
), _simd_castsi_ps(vMask7
));
245 //////////////////////////////////////////////////////////////////////////
246 /// TranposeSingleComponent
247 //////////////////////////////////////////////////////////////////////////
248 template<uint32_t bpp
>
249 struct TransposeSingleComponent
251 //////////////////////////////////////////////////////////////////////////
252 /// @brief Pass-thru for single component.
253 /// @param pSrc - source data in SOA form
254 /// @param pDst - output data in AOS form
255 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
257 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
259 #if ENABLE_AVX512_SIMD16
261 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
263 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD16_WIDTH
) / 8);
268 //////////////////////////////////////////////////////////////////////////
270 //////////////////////////////////////////////////////////////////////////
271 struct Transpose8_8_8_8
273 //////////////////////////////////////////////////////////////////////////
274 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
275 /// @param pSrc - source data in SOA form
276 /// @param pDst - output data in AOS form
277 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
279 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
281 #if KNOB_SIMD_WIDTH == 8
282 #if KNOB_ARCH <= KNOB_ARCH_AVX
283 __m128i c0c1
= _mm256_castsi256_si128(src
); // rrrrrrrrgggggggg
284 __m128i c2c3
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src
), 1)); // bbbbbbbbaaaaaaaa
285 __m128i c0c2
= _mm_unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
286 __m128i c1c3
= _mm_unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
287 __m128i c01
= _mm_unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
288 __m128i c23
= _mm_unpackhi_epi8(c0c2
, c1c3
); // babababababababa
289 __m128i c0123lo
= _mm_unpacklo_epi16(c01
, c23
); // rgbargbargbargba
290 __m128i c0123hi
= _mm_unpackhi_epi16(c01
, c23
); // rgbargbargbargba
291 _mm_store_si128((__m128i
*)pDst
, c0123lo
);
292 _mm_store_si128((__m128i
*)(pDst
+ 16), c0123hi
);
294 simdscalari dst01
= _simd_shuffle_epi8(src
,
295 _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
296 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
297 dst23
= _simd_shuffle_epi8(dst23
,
298 _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
299 simdscalari dst
= _simd_or_si(dst01
, dst23
);
300 _simd_store_si((simdscalari
*)pDst
, dst
);
303 #error Unsupported vector width
306 #if ENABLE_AVX512_SIMD16
308 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
310 __m128i src0
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
)); // rrrrrrrrrrrrrrrr
311 __m128i src1
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
) + 1); // gggggggggggggggg
312 __m128i src2
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
313 __m128i src3
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
) + 3); // aaaaaaaaaaaaaaaa
315 simd16scalari cvt0
= _simd16_cvtepu8_epi32(src0
);
316 simd16scalari cvt1
= _simd16_cvtepu8_epi32(src1
);
317 simd16scalari cvt2
= _simd16_cvtepu8_epi32(src2
);
318 simd16scalari cvt3
= _simd16_cvtepu8_epi32(src3
);
320 simd16scalari shl1
= _simd16_slli_epi32(cvt1
, 8);
321 simd16scalari shl2
= _simd16_slli_epi32(cvt2
, 16);
322 simd16scalari shl3
= _simd16_slli_epi32(cvt3
, 24);
324 simd16scalari dst
= _simd16_or_si(_simd16_or_si(cvt0
, shl1
), _simd16_or_si(shl2
, shl3
));
326 _simd16_store_si(reinterpret_cast<simd16scalari
*>(pDst
), dst
); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
331 //////////////////////////////////////////////////////////////////////////
333 //////////////////////////////////////////////////////////////////////////
334 struct Transpose8_8_8
336 //////////////////////////////////////////////////////////////////////////
337 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
338 /// @param pSrc - source data in SOA form
339 /// @param pDst - output data in AOS form
340 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
341 #if ENABLE_AVX512_SIMD16
343 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
347 //////////////////////////////////////////////////////////////////////////
349 //////////////////////////////////////////////////////////////////////////
352 //////////////////////////////////////////////////////////////////////////
353 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
354 /// @param pSrc - source data in SOA form
355 /// @param pDst - output data in AOS form
356 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
358 #if KNOB_SIMD_WIDTH == 8
359 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
361 __m128i rg
= _mm256_castsi256_si128(src
); // rrrrrrrr gggggggg
362 __m128i g
= _mm_unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
363 rg
= _mm_unpacklo_epi8(rg
, g
);
364 _mm_store_si128((__m128i
*)pDst
, rg
);
366 #error Unsupported vector width
369 #if ENABLE_AVX512_SIMD16
371 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
373 __m128i src0
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
)); // rrrrrrrrrrrrrrrr
374 __m128i src1
= _mm_load_si128(reinterpret_cast<const __m128i
*>(pSrc
) + 1); // gggggggggggggggg
376 simdscalari cvt0
= _simd_cvtepu8_epi16(src0
);
377 simdscalari cvt1
= _simd_cvtepu8_epi16(src1
);
379 simdscalari shl1
= _simd_slli_epi32(cvt1
, 8);
381 simdscalari dst
= _simd_or_si(cvt0
, shl1
);
383 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), dst
); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
388 //////////////////////////////////////////////////////////////////////////
389 /// Transpose32_32_32_32
390 //////////////////////////////////////////////////////////////////////////
391 struct Transpose32_32_32_32
393 //////////////////////////////////////////////////////////////////////////
394 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
395 /// @param pSrc - source data in SOA form
396 /// @param pDst - output data in AOS form
397 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
399 #if KNOB_SIMD_WIDTH == 8
400 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
401 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
402 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
403 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
406 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
407 _mm_store_ps((float*)pDst
, vDst
[0]);
408 _mm_store_ps((float*)pDst
+4, vDst
[1]);
409 _mm_store_ps((float*)pDst
+8, vDst
[2]);
410 _mm_store_ps((float*)pDst
+12, vDst
[3]);
411 _mm_store_ps((float*)pDst
+16, vDst
[4]);
412 _mm_store_ps((float*)pDst
+20, vDst
[5]);
413 _mm_store_ps((float*)pDst
+24, vDst
[6]);
414 _mm_store_ps((float*)pDst
+28, vDst
[7]);
416 #error Unsupported vector width
419 #if ENABLE_AVX512_SIMD16
421 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
423 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
424 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16);
425 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 32);
426 simd16scalar src3
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 48);
430 vTranspose4x16(dst
, src0
, src1
, src2
, src3
);
432 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst
[0]);
433 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst
[1]);
434 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 32, dst
[2]);
435 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 48, dst
[3]);
440 //////////////////////////////////////////////////////////////////////////
441 /// Transpose32_32_32
442 //////////////////////////////////////////////////////////////////////////
443 struct Transpose32_32_32
445 //////////////////////////////////////////////////////////////////////////
446 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
447 /// @param pSrc - source data in SOA form
448 /// @param pDst - output data in AOS form
449 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
451 #if KNOB_SIMD_WIDTH == 8
452 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
453 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
454 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
457 vTranspose3x8(vDst
, src0
, src1
, src2
);
458 _mm_store_ps((float*)pDst
, vDst
[0]);
459 _mm_store_ps((float*)pDst
+ 4, vDst
[1]);
460 _mm_store_ps((float*)pDst
+ 8, vDst
[2]);
461 _mm_store_ps((float*)pDst
+ 12, vDst
[3]);
462 _mm_store_ps((float*)pDst
+ 16, vDst
[4]);
463 _mm_store_ps((float*)pDst
+ 20, vDst
[5]);
464 _mm_store_ps((float*)pDst
+ 24, vDst
[6]);
465 _mm_store_ps((float*)pDst
+ 28, vDst
[7]);
467 #error Unsupported vector width
470 #if ENABLE_AVX512_SIMD16
472 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
474 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
475 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16);
476 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 32);
477 simd16scalar src3
= _simd16_setzero_ps();
481 vTranspose4x16(dst
, src0
, src1
, src2
, src3
);
483 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst
[0]);
484 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst
[1]);
485 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 32, dst
[2]);
486 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 48, dst
[3]);
491 //////////////////////////////////////////////////////////////////////////
493 //////////////////////////////////////////////////////////////////////////
494 struct Transpose32_32
496 //////////////////////////////////////////////////////////////////////////
497 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
498 /// @param pSrc - source data in SOA form
499 /// @param pDst - output data in AOS form
500 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
502 #if KNOB_SIMD_WIDTH == 8
503 const float* pfSrc
= (const float*)pSrc
;
504 __m128 src_r0
= _mm_load_ps(pfSrc
+ 0);
505 __m128 src_r1
= _mm_load_ps(pfSrc
+ 4);
506 __m128 src_g0
= _mm_load_ps(pfSrc
+ 8);
507 __m128 src_g1
= _mm_load_ps(pfSrc
+ 12);
509 __m128 dst0
= _mm_unpacklo_ps(src_r0
, src_g0
);
510 __m128 dst1
= _mm_unpackhi_ps(src_r0
, src_g0
);
511 __m128 dst2
= _mm_unpacklo_ps(src_r1
, src_g1
);
512 __m128 dst3
= _mm_unpackhi_ps(src_r1
, src_g1
);
514 float* pfDst
= (float*)pDst
;
515 _mm_store_ps(pfDst
+ 0, dst0
);
516 _mm_store_ps(pfDst
+ 4, dst1
);
517 _mm_store_ps(pfDst
+ 8, dst2
);
518 _mm_store_ps(pfDst
+ 12, dst3
);
520 #error Unsupported vector width
523 #if ENABLE_AVX512_SIMD16
525 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
527 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
)); // rrrrrrrrrrrrrrrr
528 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16); // gggggggggggggggg
530 simd16scalar tmp0
= _simd16_unpacklo_ps(src0
, src1
); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
531 simd16scalar tmp1
= _simd16_unpackhi_ps(src0
, src1
); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
533 simd16scalar per0
= _simd16_permute2f128_ps(tmp0
, tmp1
, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
534 simd16scalar per1
= _simd16_permute2f128_ps(tmp0
, tmp1
, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
536 simd16scalar dst0
= _simd16_permute2f128_ps(per0
, per0
, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
537 simd16scalar dst1
= _simd16_permute2f128_ps(per1
, per1
, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
539 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst0
); // rgrgrgrgrgrgrgrg
540 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst1
); // rgrgrgrgrgrgrgrg
545 //////////////////////////////////////////////////////////////////////////
546 /// Transpose16_16_16_16
547 //////////////////////////////////////////////////////////////////////////
548 struct Transpose16_16_16_16
550 //////////////////////////////////////////////////////////////////////////
551 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
552 /// @param pSrc - source data in SOA form
553 /// @param pDst - output data in AOS form
554 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
556 #if KNOB_SIMD_WIDTH == 8
557 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
558 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
560 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
561 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
562 __m128i src_b
= _mm256_extractf128_si256(src_ba
, 0);
563 __m128i src_a
= _mm256_extractf128_si256(src_ba
, 1);
565 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
566 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
567 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
568 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
570 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
571 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
572 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
573 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
575 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
576 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
577 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
578 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
580 #error Unsupported vector width
583 #if ENABLE_AVX512_SIMD16
585 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
587 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
588 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
589 simdscalari src2
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
590 simdscalari src3
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 3); // aaaaaaaaaaaaaaaa
592 simdscalari pre0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
593 simdscalari pre1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
594 simdscalari pre2
= _simd_unpacklo_epi16(src2
, src3
); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
595 simdscalari pre3
= _simd_unpackhi_epi16(src2
, src3
); // ba4 ba5 ba6 ba7 baC baD baE baF
597 simdscalari tmp0
= _simd_unpacklo_epi32(pre0
, pre2
); // rbga0 rbga1 rbga8 rbga9
598 simdscalari tmp1
= _simd_unpackhi_epi32(pre0
, pre2
); // rbga2 rbga3 rbgaA rbgaB
599 simdscalari tmp2
= _simd_unpacklo_epi32(pre1
, pre3
); // rbga4 rbga5 rgbaC rbgaD
600 simdscalari tmp3
= _simd_unpackhi_epi32(pre1
, pre3
); // rbga6 rbga7 rbgaE rbgaF
602 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
603 simdscalari dst1
= _simd_permute2f128_si(tmp2
, tmp3
, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
604 simdscalari dst2
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
605 simdscalari dst3
= _simd_permute2f128_si(tmp2
, tmp3
, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
607 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgbargbargbargba
608 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgbargbargbargba
609 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 2, dst2
); // rgbargbargbargba
610 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 3, dst3
); // rgbargbargbargba
615 //////////////////////////////////////////////////////////////////////////
616 /// Transpose16_16_16
617 //////////////////////////////////////////////////////////////////////////
618 struct Transpose16_16_16
620 //////////////////////////////////////////////////////////////////////////
621 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
622 /// @param pSrc - source data in SOA form
623 /// @param pDst - output data in AOS form
624 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
626 #if KNOB_SIMD_WIDTH == 8
627 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
629 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
630 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
631 __m128i src_b
= _mm_load_si128((const __m128i
*)(pSrc
+ sizeof(simdscalari
)));
632 __m128i src_a
= _mm_undefined_si128();
634 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
635 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
636 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
637 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
639 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
640 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
641 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
642 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
644 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
645 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
646 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
647 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
649 #error Unsupported vector width
652 #if ENABLE_AVX512_SIMD16
654 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
656 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
657 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
658 simdscalari src2
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
659 simdscalari src3
= _simd_setzero_si(); // aaaaaaaaaaaaaaaa
661 simdscalari pre0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
662 simdscalari pre1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
663 simdscalari pre2
= _simd_unpacklo_epi16(src2
, src3
); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
664 simdscalari pre3
= _simd_unpackhi_epi16(src2
, src3
); // ba4 ba5 ba6 ba7 baC baD baE baF
666 simdscalari tmp0
= _simd_unpacklo_epi32(pre0
, pre2
); // rbga0 rbga1 rbga8 rbga9
667 simdscalari tmp1
= _simd_unpackhi_epi32(pre0
, pre2
); // rbga2 rbga3 rbgaA rbgaB
668 simdscalari tmp2
= _simd_unpacklo_epi32(pre1
, pre3
); // rbga4 rbga5 rgbaC rbgaD
669 simdscalari tmp3
= _simd_unpackhi_epi32(pre1
, pre3
); // rbga6 rbga7 rbgaE rbgaF
671 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
672 simdscalari dst1
= _simd_permute2f128_si(tmp2
, tmp3
, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
673 simdscalari dst2
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
674 simdscalari dst3
= _simd_permute2f128_si(tmp2
, tmp3
, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
676 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgbargbargbargba
677 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgbargbargbargba
678 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 2, dst2
); // rgbargbargbargba
679 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 3, dst3
); // rgbargbargbargba
684 //////////////////////////////////////////////////////////////////////////
686 //////////////////////////////////////////////////////////////////////////
687 struct Transpose16_16
689 //////////////////////////////////////////////////////////////////////////
690 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
691 /// @param pSrc - source data in SOA form
692 /// @param pDst - output data in AOS form
693 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
695 #if KNOB_SIMD_WIDTH == 8
696 simdscalar src
= _simd_load_ps((const float*)pSrc
);
698 __m128 comp0
= _mm256_castps256_ps128(src
);
699 __m128 comp1
= _mm256_extractf128_ps(src
, 1);
701 __m128i comp0i
= _mm_castps_si128(comp0
);
702 __m128i comp1i
= _mm_castps_si128(comp1
);
704 __m128i resLo
= _mm_unpacklo_epi16(comp0i
, comp1i
);
705 __m128i resHi
= _mm_unpackhi_epi16(comp0i
, comp1i
);
707 _mm_store_si128((__m128i
*)pDst
, resLo
);
708 _mm_store_si128((__m128i
*)pDst
+ 1, resHi
);
710 #error Unsupported vector width
713 #if ENABLE_AVX512_SIMD16
715 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
717 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
718 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
720 simdscalari tmp0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
721 simdscalari tmp1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
723 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
724 simdscalari dst1
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
726 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgrgrgrgrgrgrgrg
727 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgrgrgrgrgrgrgrg
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64
//////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion.
    ///        Declared deleted: calling it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;

#endif
};
// helper function to unroll loops
// Invokes func(Begin), func(Begin + Step), ... while Begin < End, fully
// expanded at compile time via recursive template instantiation.
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};

// Terminator: recursion ends once the running index reaches End.
template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
// helper function to unroll loops, with mask to skip specific iterations
// Invokes func(i) for i = Begin, Begin + Step, ... (i < End) whenever bit i
// of Mask is set; iterations whose mask bit is clear are skipped.
template<int Begin, int End, int Step = 1, int Mask = 0x7f>
struct UnrollerLMask {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        if (Mask & (1 << Begin))
        {
            func(Begin);
        }
        // Recurse through UnrollerLMask with the SAME mask so the skip mask
        // applies to every iteration. (Previously this recursed into
        // UnrollerL, which calls func unconditionally, so the mask only
        // filtered the first iteration.)
        UnrollerLMask<Begin + Step, End, Step, Mask>::step(func);
    }
};

// Terminator: recursion ends once the running index reaches End.
template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
973 // general CRC compute
975 uint32_t ComputeCRC(uint32_t crc
, const void *pData
, uint32_t size
)
977 #if defined(_WIN64) || defined(__x86_64__)
978 uint32_t sizeInQwords
= size
/ sizeof(uint64_t);
979 uint32_t sizeRemainderBytes
= size
% sizeof(uint64_t);
980 uint64_t* pDataWords
= (uint64_t*)pData
;
981 for (uint32_t i
= 0; i
< sizeInQwords
; ++i
)
983 crc
= (uint32_t)_mm_crc32_u64(crc
, *pDataWords
++);
986 uint32_t sizeInDwords
= size
/ sizeof(uint32_t);
987 uint32_t sizeRemainderBytes
= size
% sizeof(uint32_t);
988 uint32_t* pDataWords
= (uint32_t*)pData
;
989 for (uint32_t i
= 0; i
< sizeInDwords
; ++i
)
991 crc
= _mm_crc32_u32(crc
, *pDataWords
++);
995 uint8_t* pRemainderBytes
= (uint8_t*)pDataWords
;
996 for (uint32_t i
= 0; i
< sizeRemainderBytes
; ++i
)
998 crc
= _mm_crc32_u8(crc
, *pRemainderBytes
++);
//////////////////////////////////////////////////////////////////////////
/// Add byte offset to any-type pointer
//////////////////////////////////////////////////////////////////////////
template <typename T>
static T* PtrAdd(T* p, intptr_t offset)
{
    // Round-trip through an integer so the offset is applied in bytes,
    // regardless of sizeof(T).
    const intptr_t base = reinterpret_cast<intptr_t>(p);
    return reinterpret_cast<T*>(base + offset);
}
//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
//////////////////////////////////////////////////////////////////////////
template <typename T>
static bool IsPow2(T value)
{
    // Isolate the lowest set bit; a power of two equals its own lowest bit.
    // NOTE: 0 also satisfies this test (no bits set) and reports true.
    const T lowestBit = value & (0 - value);
    return value == lowestBit;
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));
    // Clear the low bits covered by the alignment.
    const T1 mask = T1(alignment - 1);
    return value & ~mask;
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    // Bump past the next boundary, then truncate down to it.
    return AlignDownPow2(value + T1(alignment - 1), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    // Do the arithmetic in integer space, then convert back to a pointer.
    const uintptr_t aligned =
        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment);
    return reinterpret_cast<T1*>(aligned);
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDown(T1 value, T2 alignment)
{
    // Fast mask-based path for power-of-two alignments; generic modulo
    // fallback for everything else.
    if (IsPow2(alignment))
    {
        return AlignDownPow2(value, alignment);
    }
    return value - T1(value % alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignDown(T1* value, T2 alignment)
{
    // Align the pointer's integer value, then convert back. Uses
    // reinterpret_cast instead of the previous C-style cast, matching the
    // other pointer-alignment helpers in this file.
    return reinterpret_cast<T1*>(AlignDown(uintptr_t(value), alignment));
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUp(T1 value, T2 alignment)
{
    // Bump past the next boundary, then truncate down.
    // NOTE(review): this delegates to AlignDown, which also handles
    // non-pow2 alignments; the pow2 note above may be stricter than the
    // implementation requires — confirm before relying on it.
    return AlignDown(value + T1(alignment - 1), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUp(T1* value, T2 alignment)
{
    // Advance by (alignment - 1) bytes, then align down to the boundary.
    return AlignDown(PtrAdd(value, alignment - 1), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
    // Width of one backing storage word, in bits (size_t-sized words).
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    // How many BitsPerElementT-wide elements pack into a single word.
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    // Words required to hold ArrayLenT elements, rounded up.
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    // Low-bit mask that extracts exactly one element from a word.
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    // Elements must tile a word exactly — no element may straddle words.
    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must an integral fraction of pointer size");

    // Packed element storage, zero-initialized.
    size_t m_words[NUM_WORDS] = {};
1123 T
operator[] (size_t elementIndex
) const
1125 size_t word
= m_words
[elementIndex
/ ELEMENTS_PER_WORD
];
1126 word
>>= ((elementIndex
% ELEMENTS_PER_WORD
) * BitsPerElementT
);
1127 return T(word
& ELEMENT_MASK
);
// Ranged integer argument for TemplateArgUnroller
// Carries a runtime integer together with its static [TMin, TMax] range so
// TemplateArgUnroller can convert it into a compile-time template argument.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};
1138 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1139 // arguments to static template arguments.
1140 template <typename TermT
, typename
... ArgsB
>
1141 struct TemplateArgUnroller
1143 //-----------------------------------------
1145 //-----------------------------------------
1147 // Last Arg Terminator
1148 static typename
TermT::FuncType
GetFunc(bool bArg
)
1152 return TermT::template GetFunc
<ArgsB
..., std::true_type
>();
1155 return TermT::template GetFunc
<ArgsB
..., std::false_type
>();
1158 // Recursively parse args
1159 template <typename
... TArgsT
>
1160 static typename
TermT::FuncType
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
1164 return TemplateArgUnroller
<TermT
, ArgsB
..., std::true_type
>::GetFunc(remainingArgs
...);
1167 return TemplateArgUnroller
<TermT
, ArgsB
..., std::false_type
>::GetFunc(remainingArgs
...);
1170 //-----------------------------------------
1171 // Integer value (within specified range)
1172 //-----------------------------------------
1174 // Last Arg Terminator
1175 template <uint32_t TMin
, uint32_t TMax
>
1176 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
)
1178 if (iArg
.val
== TMax
)
1180 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TMax
>>();
1184 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
-1>{iArg
.val
});
1186 SWR_ASSUME(false); return nullptr;
1188 template <uint32_t TVal
>
1189 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
)
1191 SWR_ASSERT(iArg
.val
== TVal
);
1192 return TermT::template GetFunc
<ArgsB
..., std::integral_constant
<uint32_t, TVal
>>();
1195 // Recursively parse args
1196 template <uint32_t TMin
, uint32_t TMax
, typename
... TArgsT
>
1197 static typename
TermT::FuncType
GetFunc(IntArg
<TMin
, TMax
> iArg
, TArgsT
... remainingArgs
)
1199 if (iArg
.val
== TMax
)
1201 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TMax
>>::GetFunc(remainingArgs
...);
1205 return TemplateArgUnroller
<TermT
, ArgsB
...>::GetFunc(IntArg
<TMin
, TMax
- 1>{iArg
.val
}, remainingArgs
...);
1207 SWR_ASSUME(false); return nullptr;
1209 template <uint32_t TVal
, typename
... TArgsT
>
1210 static typename
TermT::FuncType
GetFunc(IntArg
<TVal
, TVal
> iArg
, TArgsT
... remainingArgs
)
1212 SWR_ASSERT(iArg
.val
== TVal
);
1213 return TemplateArgUnroller
<TermT
, ArgsB
..., std::integral_constant
<uint32_t, TVal
>>::GetFunc(remainingArgs
...);