b51755dab50bf589c7cd12c14304a738052bb5f3
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Utilities used by SWR core related to pixel formats.
27 ******************************************************************************/
30 #include "core/utils.h"
31 #include "common/simdintrin.h"
34 void vTranspose(simd4scalar
& row0
, simd4scalar
& row1
, simd4scalar
& row2
, simd4scalar
& row3
)
36 simd4scalari row0i
= SIMD128::castps_si(row0
);
37 simd4scalari row1i
= SIMD128::castps_si(row1
);
38 simd4scalari row2i
= SIMD128::castps_si(row2
);
39 simd4scalari row3i
= SIMD128::castps_si(row3
);
41 simd4scalari vTemp
= row2i
;
42 row2i
= SIMD128::unpacklo_epi32(row2i
, row3i
);
43 vTemp
= SIMD128::unpackhi_epi32(vTemp
, row3i
);
46 row0i
= SIMD128::unpacklo_epi32(row0i
, row1i
);
47 row3i
= SIMD128::unpackhi_epi32(row3i
, row1i
);
50 row0i
= SIMD128::unpacklo_epi64(row0i
, row2i
);
51 row1i
= SIMD128::unpackhi_epi64(row1i
, row2i
);
54 row2i
= SIMD128::unpacklo_epi64(row2i
, vTemp
);
55 row3i
= SIMD128::unpackhi_epi64(row3i
, vTemp
);
57 row0
= SIMD128::castsi_ps(row0i
);
58 row1
= SIMD128::castsi_ps(row1i
);
59 row2
= SIMD128::castsi_ps(row2i
);
60 row3
= SIMD128::castsi_ps(row3i
);
64 void vTranspose(simd4scalari
& row0
, simd4scalari
& row1
, simd4scalari
& row2
, simd4scalari
& row3
)
66 simd4scalari vTemp
= row2
;
67 row2
= SIMD128::unpacklo_epi32(row2
, row3
);
68 vTemp
= SIMD128::unpackhi_epi32(vTemp
, row3
);
71 row0
= SIMD128::unpacklo_epi32(row0
, row1
);
72 row3
= SIMD128::unpackhi_epi32(row3
, row1
);
75 row0
= SIMD128::unpacklo_epi64(row0
, row2
);
76 row1
= SIMD128::unpackhi_epi64(row1
, row2
);
79 row2
= SIMD128::unpacklo_epi64(row2
, vTemp
);
80 row3
= SIMD128::unpackhi_epi64(row3
, vTemp
);
83 #if KNOB_SIMD_WIDTH == 8
85 void vTranspose3x8(simd4scalar (&vDst
)[8],
86 const simdscalar
& vSrc0
,
87 const simdscalar
& vSrc1
,
88 const simdscalar
& vSrc2
)
90 simdscalar r0r2
= _simd_unpacklo_ps(vSrc0
, vSrc2
); // x0z0x1z1 x4z4x5z5
91 simdscalar r1rx
= _simd_unpacklo_ps(vSrc1
, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
92 simdscalar r02r1xlolo
= _simd_unpacklo_ps(r0r2
, r1rx
); // x0y0z0w0 x4y4z4w4
93 simdscalar r02r1xlohi
= _simd_unpackhi_ps(r0r2
, r1rx
); // x1y1z1w1 x5y5z5w5
95 r0r2
= _simd_unpackhi_ps(vSrc0
, vSrc2
); // x2z2x3z3 x6z6x7z7
96 r1rx
= _simd_unpackhi_ps(vSrc1
, _simd_setzero_ps()); // y2w2y3w3 y6w6yw77
97 simdscalar r02r1xhilo
= _simd_unpacklo_ps(r0r2
, r1rx
); // x2y2z2w2 x6y6z6w6
98 simdscalar r02r1xhihi
= _simd_unpackhi_ps(r0r2
, r1rx
); // x3y3z3w3 x7y7z7w7
100 vDst
[0] = _simd_extractf128_ps(r02r1xlolo
, 0);
101 vDst
[1] = _simd_extractf128_ps(r02r1xlohi
, 0);
102 vDst
[2] = _simd_extractf128_ps(r02r1xhilo
, 0);
103 vDst
[3] = _simd_extractf128_ps(r02r1xhihi
, 0);
105 vDst
[4] = _simd_extractf128_ps(r02r1xlolo
, 1);
106 vDst
[5] = _simd_extractf128_ps(r02r1xlohi
, 1);
107 vDst
[6] = _simd_extractf128_ps(r02r1xhilo
, 1);
108 vDst
[7] = _simd_extractf128_ps(r02r1xhihi
, 1);
112 void vTranspose4x8(simd4scalar (&vDst
)[8],
113 const simdscalar
& vSrc0
,
114 const simdscalar
& vSrc1
,
115 const simdscalar
& vSrc2
,
116 const simdscalar
& vSrc3
)
118 simdscalar r0r2
= _simd_unpacklo_ps(vSrc0
, vSrc2
); // x0z0x1z1 x4z4x5z5
119 simdscalar r1rx
= _simd_unpacklo_ps(vSrc1
, vSrc3
); // y0w0y1w1 y4w4y5w5
120 simdscalar r02r1xlolo
= _simd_unpacklo_ps(r0r2
, r1rx
); // x0y0z0w0 x4y4z4w4
121 simdscalar r02r1xlohi
= _simd_unpackhi_ps(r0r2
, r1rx
); // x1y1z1w1 x5y5z5w5
123 r0r2
= _simd_unpackhi_ps(vSrc0
, vSrc2
); // x2z2x3z3 x6z6x7z7
124 r1rx
= _simd_unpackhi_ps(vSrc1
, vSrc3
); // y2w2y3w3 y6w6yw77
125 simdscalar r02r1xhilo
= _simd_unpacklo_ps(r0r2
, r1rx
); // x2y2z2w2 x6y6z6w6
126 simdscalar r02r1xhihi
= _simd_unpackhi_ps(r0r2
, r1rx
); // x3y3z3w3 x7y7z7w7
128 vDst
[0] = _simd_extractf128_ps(r02r1xlolo
, 0);
129 vDst
[1] = _simd_extractf128_ps(r02r1xlohi
, 0);
130 vDst
[2] = _simd_extractf128_ps(r02r1xhilo
, 0);
131 vDst
[3] = _simd_extractf128_ps(r02r1xhihi
, 0);
133 vDst
[4] = _simd_extractf128_ps(r02r1xlolo
, 1);
134 vDst
[5] = _simd_extractf128_ps(r02r1xlohi
, 1);
135 vDst
[6] = _simd_extractf128_ps(r02r1xhilo
, 1);
136 vDst
[7] = _simd_extractf128_ps(r02r1xhihi
, 1);
139 #if ENABLE_AVX512_SIMD16
141 void vTranspose4x16(simd16scalar (&dst
)[4],
142 const simd16scalar
& src0
,
143 const simd16scalar
& src1
,
144 const simd16scalar
& src2
,
145 const simd16scalar
& src3
)
147 const simd16scalari perm
=
148 _simd16_set_epi32(15,
163 0); // pre-permute input to setup the right order after all the unpacking
165 simd16scalar pre0
= _simd16_permute_ps(src0
, perm
); // r
166 simd16scalar pre1
= _simd16_permute_ps(src1
, perm
); // g
167 simd16scalar pre2
= _simd16_permute_ps(src2
, perm
); // b
168 simd16scalar pre3
= _simd16_permute_ps(src3
, perm
); // a
170 simd16scalar rblo
= _simd16_unpacklo_ps(pre0
, pre2
);
171 simd16scalar galo
= _simd16_unpacklo_ps(pre1
, pre3
);
172 simd16scalar rbhi
= _simd16_unpackhi_ps(pre0
, pre2
);
173 simd16scalar gahi
= _simd16_unpackhi_ps(pre1
, pre3
);
175 dst
[0] = _simd16_unpacklo_ps(rblo
, galo
);
176 dst
[1] = _simd16_unpackhi_ps(rblo
, galo
);
177 dst
[2] = _simd16_unpacklo_ps(rbhi
, gahi
);
178 dst
[3] = _simd16_unpackhi_ps(rbhi
, gahi
);
183 void vTranspose8x8(simdscalar (&vDst
)[8],
184 const simdscalar
& vMask0
,
185 const simdscalar
& vMask1
,
186 const simdscalar
& vMask2
,
187 const simdscalar
& vMask3
,
188 const simdscalar
& vMask4
,
189 const simdscalar
& vMask5
,
190 const simdscalar
& vMask6
,
191 const simdscalar
& vMask7
)
193 simdscalar __t0
= _simd_unpacklo_ps(vMask0
, vMask1
);
194 simdscalar __t1
= _simd_unpackhi_ps(vMask0
, vMask1
);
195 simdscalar __t2
= _simd_unpacklo_ps(vMask2
, vMask3
);
196 simdscalar __t3
= _simd_unpackhi_ps(vMask2
, vMask3
);
197 simdscalar __t4
= _simd_unpacklo_ps(vMask4
, vMask5
);
198 simdscalar __t5
= _simd_unpackhi_ps(vMask4
, vMask5
);
199 simdscalar __t6
= _simd_unpacklo_ps(vMask6
, vMask7
);
200 simdscalar __t7
= _simd_unpackhi_ps(vMask6
, vMask7
);
201 simdscalar __tt0
= _simd_shuffle_ps(__t0
, __t2
, _MM_SHUFFLE(1, 0, 1, 0));
202 simdscalar __tt1
= _simd_shuffle_ps(__t0
, __t2
, _MM_SHUFFLE(3, 2, 3, 2));
203 simdscalar __tt2
= _simd_shuffle_ps(__t1
, __t3
, _MM_SHUFFLE(1, 0, 1, 0));
204 simdscalar __tt3
= _simd_shuffle_ps(__t1
, __t3
, _MM_SHUFFLE(3, 2, 3, 2));
205 simdscalar __tt4
= _simd_shuffle_ps(__t4
, __t6
, _MM_SHUFFLE(1, 0, 1, 0));
206 simdscalar __tt5
= _simd_shuffle_ps(__t4
, __t6
, _MM_SHUFFLE(3, 2, 3, 2));
207 simdscalar __tt6
= _simd_shuffle_ps(__t5
, __t7
, _MM_SHUFFLE(1, 0, 1, 0));
208 simdscalar __tt7
= _simd_shuffle_ps(__t5
, __t7
, _MM_SHUFFLE(3, 2, 3, 2));
209 vDst
[0] = _simd_permute2f128_ps(__tt0
, __tt4
, 0x20);
210 vDst
[1] = _simd_permute2f128_ps(__tt1
, __tt5
, 0x20);
211 vDst
[2] = _simd_permute2f128_ps(__tt2
, __tt6
, 0x20);
212 vDst
[3] = _simd_permute2f128_ps(__tt3
, __tt7
, 0x20);
213 vDst
[4] = _simd_permute2f128_ps(__tt0
, __tt4
, 0x31);
214 vDst
[5] = _simd_permute2f128_ps(__tt1
, __tt5
, 0x31);
215 vDst
[6] = _simd_permute2f128_ps(__tt2
, __tt6
, 0x31);
216 vDst
[7] = _simd_permute2f128_ps(__tt3
, __tt7
, 0x31);
220 void vTranspose8x8(simdscalar (&vDst
)[8],
221 const simdscalari
& vMask0
,
222 const simdscalari
& vMask1
,
223 const simdscalari
& vMask2
,
224 const simdscalari
& vMask3
,
225 const simdscalari
& vMask4
,
226 const simdscalari
& vMask5
,
227 const simdscalari
& vMask6
,
228 const simdscalari
& vMask7
)
231 _simd_castsi_ps(vMask0
),
232 _simd_castsi_ps(vMask1
),
233 _simd_castsi_ps(vMask2
),
234 _simd_castsi_ps(vMask3
),
235 _simd_castsi_ps(vMask4
),
236 _simd_castsi_ps(vMask5
),
237 _simd_castsi_ps(vMask6
),
238 _simd_castsi_ps(vMask7
));
242 //////////////////////////////////////////////////////////////////////////
243 /// TransposeSingleComponent
244 //////////////////////////////////////////////////////////////////////////
245 template <uint32_t bpp
>
246 struct TransposeSingleComponent
248 //////////////////////////////////////////////////////////////////////////
249 /// @brief Pass-thru for single component.
250 /// @param pSrc - source data in SOA form
251 /// @param pDst - output data in AOS form
252 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
254 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
256 #if ENABLE_AVX512_SIMD16
258 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
260 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD16_WIDTH
) / 8);
265 //////////////////////////////////////////////////////////////////////////
267 //////////////////////////////////////////////////////////////////////////
268 struct Transpose8_8_8_8
270 //////////////////////////////////////////////////////////////////////////
271 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
272 /// @param pSrc - source data in SOA form
273 /// @param pDst - output data in AOS form
274 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
276 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
278 #if KNOB_SIMD_WIDTH == 8
279 #if KNOB_ARCH <= KNOB_ARCH_AVX
280 simd4scalari c0c1
= src
.v4
[0]; // rrrrrrrrgggggggg
282 SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src
), 1)); // bbbbbbbbaaaaaaaa
283 simd4scalari c0c2
= SIMD128::unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
284 simd4scalari c1c3
= SIMD128::unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
285 simd4scalari c01
= SIMD128::unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
286 simd4scalari c23
= SIMD128::unpackhi_epi8(c0c2
, c1c3
); // babababababababa
287 simd4scalari c0123lo
= SIMD128::unpacklo_epi16(c01
, c23
); // rgbargbargbargba
288 simd4scalari c0123hi
= SIMD128::unpackhi_epi16(c01
, c23
); // rgbargbargbargba
289 SIMD128::store_si((simd4scalari
*)pDst
, c0123lo
);
290 SIMD128::store_si((simd4scalari
*)(pDst
+ 16), c0123hi
);
292 simdscalari dst01
= _simd_shuffle_epi8(src
,
293 _simd_set_epi32(0x0f078080,
301 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
302 dst23
= _simd_shuffle_epi8(dst23
,
303 _simd_set_epi32(0x80800f07,
311 simdscalari dst
= _simd_or_si(dst01
, dst23
);
312 _simd_store_si((simdscalari
*)pDst
, dst
);
315 #error Unsupported vector width
318 #if ENABLE_AVX512_SIMD16
320 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
323 SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
325 SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
) + 1); // gggggggggggggggg
327 SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
329 SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
) + 3); // aaaaaaaaaaaaaaaa
331 simd16scalari cvt0
= _simd16_cvtepu8_epi32(src0
);
332 simd16scalari cvt1
= _simd16_cvtepu8_epi32(src1
);
333 simd16scalari cvt2
= _simd16_cvtepu8_epi32(src2
);
334 simd16scalari cvt3
= _simd16_cvtepu8_epi32(src3
);
336 simd16scalari shl1
= _simd16_slli_epi32(cvt1
, 8);
337 simd16scalari shl2
= _simd16_slli_epi32(cvt2
, 16);
338 simd16scalari shl3
= _simd16_slli_epi32(cvt3
, 24);
340 simd16scalari dst
= _simd16_or_si(_simd16_or_si(cvt0
, shl1
), _simd16_or_si(shl2
, shl3
));
342 _simd16_store_si(reinterpret_cast<simd16scalari
*>(pDst
),
343 dst
); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
348 //////////////////////////////////////////////////////////////////////////
350 //////////////////////////////////////////////////////////////////////////
351 struct Transpose8_8_8
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
355 /// @param pSrc - source data in SOA form
356 /// @param pDst - output data in AOS form
357 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
358 #if ENABLE_AVX512_SIMD16
360 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
364 //////////////////////////////////////////////////////////////////////////
366 //////////////////////////////////////////////////////////////////////////
369 //////////////////////////////////////////////////////////////////////////
370 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
371 /// @param pSrc - source data in SOA form
372 /// @param pDst - output data in AOS form
373 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
375 #if KNOB_SIMD_WIDTH == 8
376 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
378 simd4scalari rg
= src
.v4
[0]; // rrrrrrrr gggggggg
379 simd4scalari g
= SIMD128::unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
380 rg
= SIMD128::unpacklo_epi8(rg
, g
);
381 SIMD128::store_si((simd4scalari
*)pDst
, rg
);
383 #error Unsupported vector width
386 #if ENABLE_AVX512_SIMD16
388 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
391 SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
393 SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
) + 1); // gggggggggggggggg
395 simdscalari cvt0
= _simd_cvtepu8_epi16(src0
);
396 simdscalari cvt1
= _simd_cvtepu8_epi16(src1
);
398 simdscalari shl1
= _simd_slli_epi32(cvt1
, 8);
400 simdscalari dst
= _simd_or_si(cvt0
, shl1
);
402 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
),
403 dst
); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
408 //////////////////////////////////////////////////////////////////////////
409 /// Transpose32_32_32_32
410 //////////////////////////////////////////////////////////////////////////
411 struct Transpose32_32_32_32
413 //////////////////////////////////////////////////////////////////////////
414 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
415 /// @param pSrc - source data in SOA form
416 /// @param pDst - output data in AOS form
417 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
419 #if KNOB_SIMD_WIDTH == 8
420 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
421 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
422 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
423 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
426 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
427 SIMD128::store_ps((float*)pDst
, vDst
[0]);
428 SIMD128::store_ps((float*)pDst
+ 4, vDst
[1]);
429 SIMD128::store_ps((float*)pDst
+ 8, vDst
[2]);
430 SIMD128::store_ps((float*)pDst
+ 12, vDst
[3]);
431 SIMD128::store_ps((float*)pDst
+ 16, vDst
[4]);
432 SIMD128::store_ps((float*)pDst
+ 20, vDst
[5]);
433 SIMD128::store_ps((float*)pDst
+ 24, vDst
[6]);
434 SIMD128::store_ps((float*)pDst
+ 28, vDst
[7]);
436 #error Unsupported vector width
439 #if ENABLE_AVX512_SIMD16
441 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
443 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
));
444 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
) + 16);
445 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
) + 32);
446 simd16scalar src3
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
) + 48);
450 vTranspose4x16(dst
, src0
, src1
, src2
, src3
);
452 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 0, dst
[0]);
453 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 16, dst
[1]);
454 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 32, dst
[2]);
455 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 48, dst
[3]);
460 //////////////////////////////////////////////////////////////////////////
461 /// Transpose32_32_32
462 //////////////////////////////////////////////////////////////////////////
463 struct Transpose32_32_32
465 //////////////////////////////////////////////////////////////////////////
466 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
467 /// @param pSrc - source data in SOA form
468 /// @param pDst - output data in AOS form
469 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
471 #if KNOB_SIMD_WIDTH == 8
472 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
473 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
474 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
477 vTranspose3x8(vDst
, src0
, src1
, src2
);
478 SIMD128::store_ps((float*)pDst
, vDst
[0]);
479 SIMD128::store_ps((float*)pDst
+ 4, vDst
[1]);
480 SIMD128::store_ps((float*)pDst
+ 8, vDst
[2]);
481 SIMD128::store_ps((float*)pDst
+ 12, vDst
[3]);
482 SIMD128::store_ps((float*)pDst
+ 16, vDst
[4]);
483 SIMD128::store_ps((float*)pDst
+ 20, vDst
[5]);
484 SIMD128::store_ps((float*)pDst
+ 24, vDst
[6]);
485 SIMD128::store_ps((float*)pDst
+ 28, vDst
[7]);
487 #error Unsupported vector width
490 #if ENABLE_AVX512_SIMD16
492 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
494 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
));
495 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
) + 16);
496 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
) + 32);
497 simd16scalar src3
= _simd16_setzero_ps();
501 vTranspose4x16(dst
, src0
, src1
, src2
, src3
);
503 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 0, dst
[0]);
504 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 16, dst
[1]);
505 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 32, dst
[2]);
506 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 48, dst
[3]);
511 //////////////////////////////////////////////////////////////////////////
513 //////////////////////////////////////////////////////////////////////////
514 struct Transpose32_32
516 //////////////////////////////////////////////////////////////////////////
517 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
518 /// @param pSrc - source data in SOA form
519 /// @param pDst - output data in AOS form
520 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
522 #if KNOB_SIMD_WIDTH == 8
523 const float* pfSrc
= (const float*)pSrc
;
524 simd4scalar src_r0
= SIMD128::load_ps(pfSrc
+ 0);
525 simd4scalar src_r1
= SIMD128::load_ps(pfSrc
+ 4);
526 simd4scalar src_g0
= SIMD128::load_ps(pfSrc
+ 8);
527 simd4scalar src_g1
= SIMD128::load_ps(pfSrc
+ 12);
529 simd4scalar dst0
= SIMD128::unpacklo_ps(src_r0
, src_g0
);
530 simd4scalar dst1
= SIMD128::unpackhi_ps(src_r0
, src_g0
);
531 simd4scalar dst2
= SIMD128::unpacklo_ps(src_r1
, src_g1
);
532 simd4scalar dst3
= SIMD128::unpackhi_ps(src_r1
, src_g1
);
534 float* pfDst
= (float*)pDst
;
535 SIMD128::store_ps(pfDst
+ 0, dst0
);
536 SIMD128::store_ps(pfDst
+ 4, dst1
);
537 SIMD128::store_ps(pfDst
+ 8, dst2
);
538 SIMD128::store_ps(pfDst
+ 12, dst3
);
540 #error Unsupported vector width
543 #if ENABLE_AVX512_SIMD16
545 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
548 _simd16_load_ps(reinterpret_cast<const float*>(pSrc
)); // rrrrrrrrrrrrrrrr
550 _simd16_load_ps(reinterpret_cast<const float*>(pSrc
) + 16); // gggggggggggggggg
553 _simd16_unpacklo_ps(src0
, src1
); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
555 _simd16_unpackhi_ps(src0
, src1
); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
557 simd16scalar per0
= _simd16_permute2f128_ps(
560 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
561 simd16scalar per1
= _simd16_permute2f128_ps(
564 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
566 simd16scalar dst0
= _simd16_permute2f128_ps(
569 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
570 simd16scalar dst1
= _simd16_permute2f128_ps(
573 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
575 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 0, dst0
); // rgrgrgrgrgrgrgrg
576 _simd16_store_ps(reinterpret_cast<float*>(pDst
) + 16, dst1
); // rgrgrgrgrgrgrgrg
581 //////////////////////////////////////////////////////////////////////////
582 /// Transpose16_16_16_16
583 //////////////////////////////////////////////////////////////////////////
584 struct Transpose16_16_16_16
586 //////////////////////////////////////////////////////////////////////////
587 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
588 /// @param pSrc - source data in SOA form
589 /// @param pDst - output data in AOS form
590 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
592 #if KNOB_SIMD_WIDTH == 8
593 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
594 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
596 simd4scalari src_r
= _simd_extractf128_si(src_rg
, 0);
597 simd4scalari src_g
= _simd_extractf128_si(src_rg
, 1);
598 simd4scalari src_b
= _simd_extractf128_si(src_ba
, 0);
599 simd4scalari src_a
= _simd_extractf128_si(src_ba
, 1);
601 simd4scalari rg0
= SIMD128::unpacklo_epi16(src_r
, src_g
);
602 simd4scalari rg1
= SIMD128::unpackhi_epi16(src_r
, src_g
);
603 simd4scalari ba0
= SIMD128::unpacklo_epi16(src_b
, src_a
);
604 simd4scalari ba1
= SIMD128::unpackhi_epi16(src_b
, src_a
);
606 simd4scalari dst0
= SIMD128::unpacklo_epi32(rg0
, ba0
);
607 simd4scalari dst1
= SIMD128::unpackhi_epi32(rg0
, ba0
);
608 simd4scalari dst2
= SIMD128::unpacklo_epi32(rg1
, ba1
);
609 simd4scalari dst3
= SIMD128::unpackhi_epi32(rg1
, ba1
);
611 SIMD128::store_si(((simd4scalari
*)pDst
) + 0, dst0
);
612 SIMD128::store_si(((simd4scalari
*)pDst
) + 1, dst1
);
613 SIMD128::store_si(((simd4scalari
*)pDst
) + 2, dst2
);
614 SIMD128::store_si(((simd4scalari
*)pDst
) + 3, dst3
);
616 #error Unsupported vector width
619 #if ENABLE_AVX512_SIMD16
621 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
624 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
626 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
628 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
630 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 3); // aaaaaaaaaaaaaaaa
632 simdscalari pre0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
633 simdscalari pre1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
634 simdscalari pre2
= _simd_unpacklo_epi16(src2
, src3
); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
635 simdscalari pre3
= _simd_unpackhi_epi16(src2
, src3
); // ba4 ba5 ba6 ba7 baC baD baE baF
637 simdscalari tmp0
= _simd_unpacklo_epi32(pre0
, pre2
); // rbga0 rbga1 rbga8 rbga9
638 simdscalari tmp1
= _simd_unpackhi_epi32(pre0
, pre2
); // rbga2 rbga3 rbgaA rbgaB
639 simdscalari tmp2
= _simd_unpacklo_epi32(pre1
, pre3
); // rbga4 rbga5 rgbaC rbgaD
640 simdscalari tmp3
= _simd_unpackhi_epi32(pre1
, pre3
); // rbga6 rbga7 rbgaE rbgaF
642 simdscalari dst0
= _simd_permute2f128_si(
643 tmp0
, tmp1
, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
644 simdscalari dst1
= _simd_permute2f128_si(
645 tmp2
, tmp3
, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
646 simdscalari dst2
= _simd_permute2f128_si(
647 tmp0
, tmp1
, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
648 simdscalari dst3
= _simd_permute2f128_si(
649 tmp2
, tmp3
, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
651 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgbargbargbargba
652 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgbargbargbargba
653 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 2, dst2
); // rgbargbargbargba
654 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 3, dst3
); // rgbargbargbargba
659 //////////////////////////////////////////////////////////////////////////
660 /// Transpose16_16_16
661 //////////////////////////////////////////////////////////////////////////
662 struct Transpose16_16_16
664 //////////////////////////////////////////////////////////////////////////
665 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
666 /// @param pSrc - source data in SOA form
667 /// @param pDst - output data in AOS form
668 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
670 #if KNOB_SIMD_WIDTH == 8
671 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
673 simd4scalari src_r
= _simd_extractf128_si(src_rg
, 0);
674 simd4scalari src_g
= _simd_extractf128_si(src_rg
, 1);
675 simd4scalari src_b
= SIMD128::load_si((const simd4scalari
*)(pSrc
+ sizeof(simdscalari
)));
676 simd4scalari src_a
= SIMD128::setzero_si();
678 simd4scalari rg0
= SIMD128::unpacklo_epi16(src_r
, src_g
);
679 simd4scalari rg1
= SIMD128::unpackhi_epi16(src_r
, src_g
);
680 simd4scalari ba0
= SIMD128::unpacklo_epi16(src_b
, src_a
);
681 simd4scalari ba1
= SIMD128::unpackhi_epi16(src_b
, src_a
);
683 simd4scalari dst0
= SIMD128::unpacklo_epi32(rg0
, ba0
);
684 simd4scalari dst1
= SIMD128::unpackhi_epi32(rg0
, ba0
);
685 simd4scalari dst2
= SIMD128::unpacklo_epi32(rg1
, ba1
);
686 simd4scalari dst3
= SIMD128::unpackhi_epi32(rg1
, ba1
);
688 SIMD128::store_si(((simd4scalari
*)pDst
) + 0, dst0
);
689 SIMD128::store_si(((simd4scalari
*)pDst
) + 1, dst1
);
690 SIMD128::store_si(((simd4scalari
*)pDst
) + 2, dst2
);
691 SIMD128::store_si(((simd4scalari
*)pDst
) + 3, dst3
);
693 #error Unsupported vector width
696 #if ENABLE_AVX512_SIMD16
698 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
701 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
703 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
705 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
706 simdscalari src3
= _simd_setzero_si(); // aaaaaaaaaaaaaaaa
708 simdscalari pre0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
709 simdscalari pre1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
710 simdscalari pre2
= _simd_unpacklo_epi16(src2
, src3
); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
711 simdscalari pre3
= _simd_unpackhi_epi16(src2
, src3
); // ba4 ba5 ba6 ba7 baC baD baE baF
713 simdscalari tmp0
= _simd_unpacklo_epi32(pre0
, pre2
); // rbga0 rbga1 rbga8 rbga9
714 simdscalari tmp1
= _simd_unpackhi_epi32(pre0
, pre2
); // rbga2 rbga3 rbgaA rbgaB
715 simdscalari tmp2
= _simd_unpacklo_epi32(pre1
, pre3
); // rbga4 rbga5 rgbaC rbgaD
716 simdscalari tmp3
= _simd_unpackhi_epi32(pre1
, pre3
); // rbga6 rbga7 rbgaE rbgaF
718 simdscalari dst0
= _simd_permute2f128_si(
719 tmp0
, tmp1
, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
720 simdscalari dst1
= _simd_permute2f128_si(
721 tmp2
, tmp3
, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
722 simdscalari dst2
= _simd_permute2f128_si(
723 tmp0
, tmp1
, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
724 simdscalari dst3
= _simd_permute2f128_si(
725 tmp2
, tmp3
, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
727 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgbargbargbargba
728 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgbargbargbargba
729 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 2, dst2
); // rgbargbargbargba
730 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 3, dst3
); // rgbargbargbargba
735 //////////////////////////////////////////////////////////////////////////
737 //////////////////////////////////////////////////////////////////////////
738 struct Transpose16_16
740 //////////////////////////////////////////////////////////////////////////
741 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
742 /// @param pSrc - source data in SOA form
743 /// @param pDst - output data in AOS form
744 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
746 #if KNOB_SIMD_WIDTH == 8
747 simdscalar src
= _simd_load_ps((const float*)pSrc
);
749 simd4scalar comp0
= _simd_extractf128_ps(src
, 0);
750 simd4scalar comp1
= _simd_extractf128_ps(src
, 1);
752 simd4scalari comp0i
= SIMD128::castps_si(comp0
);
753 simd4scalari comp1i
= SIMD128::castps_si(comp1
);
755 simd4scalari resLo
= SIMD128::unpacklo_epi16(comp0i
, comp1i
);
756 simd4scalari resHi
= SIMD128::unpackhi_epi16(comp0i
, comp1i
);
758 SIMD128::store_si((simd4scalari
*)pDst
, resLo
);
759 SIMD128::store_si((simd4scalari
*)pDst
+ 1, resHi
);
761 #error Unsupported vector width
764 #if ENABLE_AVX512_SIMD16
766 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
769 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
771 _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
773 simdscalari tmp0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
774 simdscalari tmp1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
776 simdscalari dst0
= _simd_permute2f128_si(
777 tmp0
, tmp1
, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
778 simdscalari dst1
= _simd_permute2f128_si(
779 tmp0
, tmp1
, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
781 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgrgrgrgrgrgrgrg
782 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgrgrgrgrgrgrgrg
787 //////////////////////////////////////////////////////////////////////////
789 //////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
/// @brief SOA to AOS conversion for packed 24_8 data. Declared as
///        deleted, so any call is rejected at compile time.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
797 #if ENABLE_AVX512_SIMD16
/// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set; deleted,
///        so any call is rejected at compile time.
static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
803 //////////////////////////////////////////////////////////////////////////
805 //////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data. Declared as
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
819 //////////////////////////////////////////////////////////////////////////
821 //////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data. Declared as
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
835 //////////////////////////////////////////////////////////////////////////
837 //////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data. Declared as
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
851 //////////////////////////////////////////////////////////////////////////
853 //////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data. Declared as
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
867 //////////////////////////////////////////////////////////////////////////
869 //////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data. Declared as
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
883 //////////////////////////////////////////////////////////////////////////
885 //////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
895 //////////////////////////////////////////////////////////////////////////
896 /// Transpose10_10_10_2
897 //////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data. Declared as
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
911 //////////////////////////////////////////////////////////////////////////
912 /// Transpose11_11_10
913 //////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data. Declared as
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
927 //////////////////////////////////////////////////////////////////////////
929 //////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
/// @brief SOA to AOS conversion. Declared as deleted, so any call is
///        rejected at compile time.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
937 #if ENABLE_AVX512_SIMD16
/// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set; deleted,
///        so any call is rejected at compile time.
static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
943 //////////////////////////////////////////////////////////////////////////
945 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion. Declared as deleted, so any call is
    ///        rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
959 //////////////////////////////////////////////////////////////////////////
960 /// Transpose64_64_64
961 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion. Declared as deleted, so any call is
    ///        rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set;
    ///        likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
975 //////////////////////////////////////////////////////////////////////////
976 /// Transpose64_64_64_64
977 //////////////////////////////////////////////////////////////////////////
978 struct Transpose64_64_64_64
//////////////////////////////////////////////////////////////////////////
/// @brief SOA to AOS conversion. Declared as deleted, so any call is
///        rejected at compile time.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
985 #if ENABLE_AVX512_SIMD16
/// @brief Variant declared only when ENABLE_AVX512_SIMD16 is set; deleted,
///        so any call is rejected at compile time.
static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;