/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Utilities used by SWR core related to pixel formats.
*
******************************************************************************/
30 #include "core/utils.h"
31 #include "common/simdintrin.h"
34 void vTranspose(simd4scalar
&row0
, simd4scalar
&row1
, simd4scalar
&row2
, simd4scalar
&row3
)
36 simd4scalari row0i
= SIMD128::castps_si(row0
);
37 simd4scalari row1i
= SIMD128::castps_si(row1
);
38 simd4scalari row2i
= SIMD128::castps_si(row2
);
39 simd4scalari row3i
= SIMD128::castps_si(row3
);
41 simd4scalari vTemp
= row2i
;
42 row2i
= SIMD128::unpacklo_epi32(row2i
, row3i
);
43 vTemp
= SIMD128::unpackhi_epi32(vTemp
, row3i
);
46 row0i
= SIMD128::unpacklo_epi32(row0i
, row1i
);
47 row3i
= SIMD128::unpackhi_epi32(row3i
, row1i
);
50 row0i
= SIMD128::unpacklo_epi64(row0i
, row2i
);
51 row1i
= SIMD128::unpackhi_epi64(row1i
, row2i
);
54 row2i
= SIMD128::unpacklo_epi64(row2i
, vTemp
);
55 row3i
= SIMD128::unpackhi_epi64(row3i
, vTemp
);
57 row0
= SIMD128::castsi_ps(row0i
);
58 row1
= SIMD128::castsi_ps(row1i
);
59 row2
= SIMD128::castsi_ps(row2i
);
60 row3
= SIMD128::castsi_ps(row3i
);
64 void vTranspose(simd4scalari
&row0
, simd4scalari
&row1
, simd4scalari
&row2
, simd4scalari
&row3
)
66 simd4scalari vTemp
= row2
;
67 row2
= SIMD128::unpacklo_epi32(row2
, row3
);
68 vTemp
= SIMD128::unpackhi_epi32(vTemp
, row3
);
71 row0
= SIMD128::unpacklo_epi32(row0
, row1
);
72 row3
= SIMD128::unpackhi_epi32(row3
, row1
);
75 row0
= SIMD128::unpacklo_epi64(row0
, row2
);
76 row1
= SIMD128::unpackhi_epi64(row1
, row2
);
79 row2
= SIMD128::unpacklo_epi64(row2
, vTemp
);
80 row3
= SIMD128::unpackhi_epi64(row3
, vTemp
);
83 #if KNOB_SIMD_WIDTH == 8
85 void vTranspose3x8(simd4scalar (&vDst
)[8], const simdscalar
&vSrc0
, const simdscalar
&vSrc1
, const simdscalar
&vSrc2
)
87 simdscalar r0r2
= _simd_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
88 simdscalar r1rx
= _simd_unpacklo_ps(vSrc1
, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
89 simdscalar r02r1xlolo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
90 simdscalar r02r1xlohi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
92 r0r2
= _simd_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
93 r1rx
= _simd_unpackhi_ps(vSrc1
, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77
94 simdscalar r02r1xhilo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
95 simdscalar r02r1xhihi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
97 vDst
[0] = _simd_extractf128_ps(r02r1xlolo
, 0);
98 vDst
[1] = _simd_extractf128_ps(r02r1xlohi
, 0);
99 vDst
[2] = _simd_extractf128_ps(r02r1xhilo
, 0);
100 vDst
[3] = _simd_extractf128_ps(r02r1xhihi
, 0);
102 vDst
[4] = _simd_extractf128_ps(r02r1xlolo
, 1);
103 vDst
[5] = _simd_extractf128_ps(r02r1xlohi
, 1);
104 vDst
[6] = _simd_extractf128_ps(r02r1xhilo
, 1);
105 vDst
[7] = _simd_extractf128_ps(r02r1xhihi
, 1);
109 void vTranspose4x8(simd4scalar (&vDst
)[8], const simdscalar
&vSrc0
, const simdscalar
&vSrc1
, const simdscalar
&vSrc2
, const simdscalar
&vSrc3
)
111 simdscalar r0r2
= _simd_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
112 simdscalar r1rx
= _simd_unpacklo_ps(vSrc1
, vSrc3
); //y0w0y1w1 y4w4y5w5
113 simdscalar r02r1xlolo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
114 simdscalar r02r1xlohi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
116 r0r2
= _simd_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
117 r1rx
= _simd_unpackhi_ps(vSrc1
, vSrc3
); //y2w2y3w3 y6w6yw77
118 simdscalar r02r1xhilo
= _simd_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
119 simdscalar r02r1xhihi
= _simd_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
121 vDst
[0] = _simd_extractf128_ps(r02r1xlolo
, 0);
122 vDst
[1] = _simd_extractf128_ps(r02r1xlohi
, 0);
123 vDst
[2] = _simd_extractf128_ps(r02r1xhilo
, 0);
124 vDst
[3] = _simd_extractf128_ps(r02r1xhihi
, 0);
126 vDst
[4] = _simd_extractf128_ps(r02r1xlolo
, 1);
127 vDst
[5] = _simd_extractf128_ps(r02r1xlohi
, 1);
128 vDst
[6] = _simd_extractf128_ps(r02r1xhilo
, 1);
129 vDst
[7] = _simd_extractf128_ps(r02r1xhihi
, 1);
#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief Interleaves four 16-wide component vectors into four 16-wide
///        vectors of per-element 4-component groups.
/// @param dst  - output vectors
/// @param src0 - first component (r)
/// @param src1 - second component (g)
/// @param src2 - third component (b)
/// @param src3 - fourth component (a)
void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
{
    // pre-permute input to setup the right order after all the unpacking
    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    simd16scalar r = _simd16_permute_ps(src0, perm);
    simd16scalar g = _simd16_permute_ps(src1, perm);
    simd16scalar b = _simd16_permute_ps(src2, perm);
    simd16scalar a = _simd16_permute_ps(src3, perm);

    // Pair r with b and g with a so the second unpack yields rgba order.
    simd16scalar rbLo = _simd16_unpacklo_ps(r, b);
    simd16scalar gaLo = _simd16_unpacklo_ps(g, a);
    simd16scalar rbHi = _simd16_unpackhi_ps(r, b);
    simd16scalar gaHi = _simd16_unpackhi_ps(g, a);

    dst[0] = _simd16_unpacklo_ps(rbLo, gaLo);
    dst[1] = _simd16_unpackhi_ps(rbLo, gaLo);
    dst[2] = _simd16_unpacklo_ps(rbHi, gaHi);
    dst[3] = _simd16_unpackhi_ps(rbHi, gaHi);
}
#endif // ENABLE_AVX512_SIMD16
156 void vTranspose8x8(simdscalar (&vDst
)[8], const simdscalar
&vMask0
, const simdscalar
&vMask1
, const simdscalar
&vMask2
, const simdscalar
&vMask3
, const simdscalar
&vMask4
, const simdscalar
&vMask5
, const simdscalar
&vMask6
, const simdscalar
&vMask7
)
158 simdscalar __t0
= _simd_unpacklo_ps(vMask0
, vMask1
);
159 simdscalar __t1
= _simd_unpackhi_ps(vMask0
, vMask1
);
160 simdscalar __t2
= _simd_unpacklo_ps(vMask2
, vMask3
);
161 simdscalar __t3
= _simd_unpackhi_ps(vMask2
, vMask3
);
162 simdscalar __t4
= _simd_unpacklo_ps(vMask4
, vMask5
);
163 simdscalar __t5
= _simd_unpackhi_ps(vMask4
, vMask5
);
164 simdscalar __t6
= _simd_unpacklo_ps(vMask6
, vMask7
);
165 simdscalar __t7
= _simd_unpackhi_ps(vMask6
, vMask7
);
166 simdscalar __tt0
= _simd_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(1,0,1,0));
167 simdscalar __tt1
= _simd_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(3,2,3,2));
168 simdscalar __tt2
= _simd_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(1,0,1,0));
169 simdscalar __tt3
= _simd_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(3,2,3,2));
170 simdscalar __tt4
= _simd_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(1,0,1,0));
171 simdscalar __tt5
= _simd_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(3,2,3,2));
172 simdscalar __tt6
= _simd_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(1,0,1,0));
173 simdscalar __tt7
= _simd_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(3,2,3,2));
174 vDst
[0] = _simd_permute2f128_ps(__tt0
, __tt4
, 0x20);
175 vDst
[1] = _simd_permute2f128_ps(__tt1
, __tt5
, 0x20);
176 vDst
[2] = _simd_permute2f128_ps(__tt2
, __tt6
, 0x20);
177 vDst
[3] = _simd_permute2f128_ps(__tt3
, __tt7
, 0x20);
178 vDst
[4] = _simd_permute2f128_ps(__tt0
, __tt4
, 0x31);
179 vDst
[5] = _simd_permute2f128_ps(__tt1
, __tt5
, 0x31);
180 vDst
[6] = _simd_permute2f128_ps(__tt2
, __tt6
, 0x31);
181 vDst
[7] = _simd_permute2f128_ps(__tt3
, __tt7
, 0x31);
185 void vTranspose8x8(simdscalar (&vDst
)[8], const simdscalari
&vMask0
, const simdscalari
&vMask1
, const simdscalari
&vMask2
, const simdscalari
&vMask3
, const simdscalari
&vMask4
, const simdscalari
&vMask5
, const simdscalari
&vMask6
, const simdscalari
&vMask7
)
187 vTranspose8x8(vDst
, _simd_castsi_ps(vMask0
), _simd_castsi_ps(vMask1
), _simd_castsi_ps(vMask2
), _simd_castsi_ps(vMask3
),
188 _simd_castsi_ps(vMask4
), _simd_castsi_ps(vMask5
), _simd_castsi_ps(vMask6
), _simd_castsi_ps(vMask7
));
192 //////////////////////////////////////////////////////////////////////////
193 /// TranposeSingleComponent
194 //////////////////////////////////////////////////////////////////////////
195 template<uint32_t bpp
>
196 struct TransposeSingleComponent
198 //////////////////////////////////////////////////////////////////////////
199 /// @brief Pass-thru for single component.
200 /// @param pSrc - source data in SOA form
201 /// @param pDst - output data in AOS form
202 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
204 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
206 #if ENABLE_AVX512_SIMD16
208 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
210 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD16_WIDTH
) / 8);
215 //////////////////////////////////////////////////////////////////////////
217 //////////////////////////////////////////////////////////////////////////
218 struct Transpose8_8_8_8
220 //////////////////////////////////////////////////////////////////////////
221 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
222 /// @param pSrc - source data in SOA form
223 /// @param pDst - output data in AOS form
224 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
226 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
228 #if KNOB_SIMD_WIDTH == 8
229 #if KNOB_ARCH <= KNOB_ARCH_AVX
230 simd4scalari c0c1
= src
.v4
[0]; // rrrrrrrrgggggggg
231 simd4scalari c2c3
= SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src
), 1)); // bbbbbbbbaaaaaaaa
232 simd4scalari c0c2
= SIMD128::unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
233 simd4scalari c1c3
= SIMD128::unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
234 simd4scalari c01
= SIMD128::unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
235 simd4scalari c23
= SIMD128::unpackhi_epi8(c0c2
, c1c3
); // babababababababa
236 simd4scalari c0123lo
= SIMD128::unpacklo_epi16(c01
, c23
); // rgbargbargbargba
237 simd4scalari c0123hi
= SIMD128::unpackhi_epi16(c01
, c23
); // rgbargbargbargba
238 SIMD128::store_si((simd4scalari
*)pDst
, c0123lo
);
239 SIMD128::store_si((simd4scalari
*)(pDst
+ 16), c0123hi
);
241 simdscalari dst01
= _simd_shuffle_epi8(src
,
242 _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
243 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
244 dst23
= _simd_shuffle_epi8(dst23
,
245 _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
246 simdscalari dst
= _simd_or_si(dst01
, dst23
);
247 _simd_store_si((simdscalari
*)pDst
, dst
);
250 #error Unsupported vector width
253 #if ENABLE_AVX512_SIMD16
255 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
257 simd4scalari src0
= SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
258 simd4scalari src1
= SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
) + 1); // gggggggggggggggg
259 simd4scalari src2
= SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
260 simd4scalari src3
= SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
) + 3); // aaaaaaaaaaaaaaaa
262 simd16scalari cvt0
= _simd16_cvtepu8_epi32(src0
);
263 simd16scalari cvt1
= _simd16_cvtepu8_epi32(src1
);
264 simd16scalari cvt2
= _simd16_cvtepu8_epi32(src2
);
265 simd16scalari cvt3
= _simd16_cvtepu8_epi32(src3
);
267 simd16scalari shl1
= _simd16_slli_epi32(cvt1
, 8);
268 simd16scalari shl2
= _simd16_slli_epi32(cvt2
, 16);
269 simd16scalari shl3
= _simd16_slli_epi32(cvt3
, 24);
271 simd16scalari dst
= _simd16_or_si(_simd16_or_si(cvt0
, shl1
), _simd16_or_si(shl2
, shl3
));
273 _simd16_store_si(reinterpret_cast<simd16scalari
*>(pDst
), dst
); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
278 //////////////////////////////////////////////////////////////////////////
280 //////////////////////////////////////////////////////////////////////////
281 struct Transpose8_8_8
283 //////////////////////////////////////////////////////////////////////////
284 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
285 /// @param pSrc - source data in SOA form
286 /// @param pDst - output data in AOS form
287 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
288 #if ENABLE_AVX512_SIMD16
290 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
294 //////////////////////////////////////////////////////////////////////////
296 //////////////////////////////////////////////////////////////////////////
299 //////////////////////////////////////////////////////////////////////////
300 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
301 /// @param pSrc - source data in SOA form
302 /// @param pDst - output data in AOS form
303 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
305 #if KNOB_SIMD_WIDTH == 8
306 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
308 simd4scalari rg
= src
.v4
[0]; // rrrrrrrr gggggggg
309 simd4scalari g
= SIMD128::unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
310 rg
= SIMD128::unpacklo_epi8(rg
, g
);
311 SIMD128::store_si((simd4scalari
*)pDst
, rg
);
313 #error Unsupported vector width
316 #if ENABLE_AVX512_SIMD16
318 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
320 simd4scalari src0
= SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
321 simd4scalari src1
= SIMD128::load_si(reinterpret_cast<const simd4scalari
*>(pSrc
) + 1); // gggggggggggggggg
323 simdscalari cvt0
= _simd_cvtepu8_epi16(src0
);
324 simdscalari cvt1
= _simd_cvtepu8_epi16(src1
);
326 simdscalari shl1
= _simd_slli_epi32(cvt1
, 8);
328 simdscalari dst
= _simd_or_si(cvt0
, shl1
);
330 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
), dst
); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
335 //////////////////////////////////////////////////////////////////////////
336 /// Transpose32_32_32_32
337 //////////////////////////////////////////////////////////////////////////
338 struct Transpose32_32_32_32
340 //////////////////////////////////////////////////////////////////////////
341 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
342 /// @param pSrc - source data in SOA form
343 /// @param pDst - output data in AOS form
344 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
346 #if KNOB_SIMD_WIDTH == 8
347 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
348 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
349 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
350 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
353 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
354 SIMD128::store_ps((float*)pDst
, vDst
[0]);
355 SIMD128::store_ps((float*)pDst
+4, vDst
[1]);
356 SIMD128::store_ps((float*)pDst
+8, vDst
[2]);
357 SIMD128::store_ps((float*)pDst
+12, vDst
[3]);
358 SIMD128::store_ps((float*)pDst
+16, vDst
[4]);
359 SIMD128::store_ps((float*)pDst
+20, vDst
[5]);
360 SIMD128::store_ps((float*)pDst
+24, vDst
[6]);
361 SIMD128::store_ps((float*)pDst
+28, vDst
[7]);
363 #error Unsupported vector width
366 #if ENABLE_AVX512_SIMD16
368 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
370 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
371 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16);
372 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 32);
373 simd16scalar src3
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 48);
377 vTranspose4x16(dst
, src0
, src1
, src2
, src3
);
379 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst
[0]);
380 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst
[1]);
381 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 32, dst
[2]);
382 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 48, dst
[3]);
387 //////////////////////////////////////////////////////////////////////////
388 /// Transpose32_32_32
389 //////////////////////////////////////////////////////////////////////////
390 struct Transpose32_32_32
392 //////////////////////////////////////////////////////////////////////////
393 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
394 /// @param pSrc - source data in SOA form
395 /// @param pDst - output data in AOS form
396 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
398 #if KNOB_SIMD_WIDTH == 8
399 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
400 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
401 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
404 vTranspose3x8(vDst
, src0
, src1
, src2
);
405 SIMD128::store_ps((float*)pDst
, vDst
[0]);
406 SIMD128::store_ps((float*)pDst
+ 4, vDst
[1]);
407 SIMD128::store_ps((float*)pDst
+ 8, vDst
[2]);
408 SIMD128::store_ps((float*)pDst
+ 12, vDst
[3]);
409 SIMD128::store_ps((float*)pDst
+ 16, vDst
[4]);
410 SIMD128::store_ps((float*)pDst
+ 20, vDst
[5]);
411 SIMD128::store_ps((float*)pDst
+ 24, vDst
[6]);
412 SIMD128::store_ps((float*)pDst
+ 28, vDst
[7]);
414 #error Unsupported vector width
417 #if ENABLE_AVX512_SIMD16
419 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
421 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
422 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16);
423 simd16scalar src2
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 32);
424 simd16scalar src3
= _simd16_setzero_ps();
428 vTranspose4x16(dst
, src0
, src1
, src2
, src3
);
430 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst
[0]);
431 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst
[1]);
432 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 32, dst
[2]);
433 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 48, dst
[3]);
438 //////////////////////////////////////////////////////////////////////////
440 //////////////////////////////////////////////////////////////////////////
441 struct Transpose32_32
443 //////////////////////////////////////////////////////////////////////////
444 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
445 /// @param pSrc - source data in SOA form
446 /// @param pDst - output data in AOS form
447 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
449 #if KNOB_SIMD_WIDTH == 8
450 const float* pfSrc
= (const float*)pSrc
;
451 simd4scalar src_r0
= SIMD128::load_ps(pfSrc
+ 0);
452 simd4scalar src_r1
= SIMD128::load_ps(pfSrc
+ 4);
453 simd4scalar src_g0
= SIMD128::load_ps(pfSrc
+ 8);
454 simd4scalar src_g1
= SIMD128::load_ps(pfSrc
+ 12);
456 simd4scalar dst0
= SIMD128::unpacklo_ps(src_r0
, src_g0
);
457 simd4scalar dst1
= SIMD128::unpackhi_ps(src_r0
, src_g0
);
458 simd4scalar dst2
= SIMD128::unpacklo_ps(src_r1
, src_g1
);
459 simd4scalar dst3
= SIMD128::unpackhi_ps(src_r1
, src_g1
);
461 float* pfDst
= (float*)pDst
;
462 SIMD128::store_ps(pfDst
+ 0, dst0
);
463 SIMD128::store_ps(pfDst
+ 4, dst1
);
464 SIMD128::store_ps(pfDst
+ 8, dst2
);
465 SIMD128::store_ps(pfDst
+ 12, dst3
);
467 #error Unsupported vector width
470 #if ENABLE_AVX512_SIMD16
472 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
474 simd16scalar src0
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
)); // rrrrrrrrrrrrrrrr
475 simd16scalar src1
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
) + 16); // gggggggggggggggg
477 simd16scalar tmp0
= _simd16_unpacklo_ps(src0
, src1
); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
478 simd16scalar tmp1
= _simd16_unpackhi_ps(src0
, src1
); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
480 simd16scalar per0
= _simd16_permute2f128_ps(tmp0
, tmp1
, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
481 simd16scalar per1
= _simd16_permute2f128_ps(tmp0
, tmp1
, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
483 simd16scalar dst0
= _simd16_permute2f128_ps(per0
, per0
, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
484 simd16scalar dst1
= _simd16_permute2f128_ps(per1
, per1
, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
486 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 0, dst0
); // rgrgrgrgrgrgrgrg
487 _simd16_store_ps(reinterpret_cast<float *>(pDst
) + 16, dst1
); // rgrgrgrgrgrgrgrg
492 //////////////////////////////////////////////////////////////////////////
493 /// Transpose16_16_16_16
494 //////////////////////////////////////////////////////////////////////////
495 struct Transpose16_16_16_16
497 //////////////////////////////////////////////////////////////////////////
498 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
499 /// @param pSrc - source data in SOA form
500 /// @param pDst - output data in AOS form
501 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
503 #if KNOB_SIMD_WIDTH == 8
504 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
505 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
507 simd4scalari src_r
= _simd_extractf128_si(src_rg
, 0);
508 simd4scalari src_g
= _simd_extractf128_si(src_rg
, 1);
509 simd4scalari src_b
= _simd_extractf128_si(src_ba
, 0);
510 simd4scalari src_a
= _simd_extractf128_si(src_ba
, 1);
512 simd4scalari rg0
= SIMD128::unpacklo_epi16(src_r
, src_g
);
513 simd4scalari rg1
= SIMD128::unpackhi_epi16(src_r
, src_g
);
514 simd4scalari ba0
= SIMD128::unpacklo_epi16(src_b
, src_a
);
515 simd4scalari ba1
= SIMD128::unpackhi_epi16(src_b
, src_a
);
517 simd4scalari dst0
= SIMD128::unpacklo_epi32(rg0
, ba0
);
518 simd4scalari dst1
= SIMD128::unpackhi_epi32(rg0
, ba0
);
519 simd4scalari dst2
= SIMD128::unpacklo_epi32(rg1
, ba1
);
520 simd4scalari dst3
= SIMD128::unpackhi_epi32(rg1
, ba1
);
522 SIMD128::store_si(((simd4scalari
*)pDst
) + 0, dst0
);
523 SIMD128::store_si(((simd4scalari
*)pDst
) + 1, dst1
);
524 SIMD128::store_si(((simd4scalari
*)pDst
) + 2, dst2
);
525 SIMD128::store_si(((simd4scalari
*)pDst
) + 3, dst3
);
527 #error Unsupported vector width
530 #if ENABLE_AVX512_SIMD16
532 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
534 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
535 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
536 simdscalari src2
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
537 simdscalari src3
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 3); // aaaaaaaaaaaaaaaa
539 simdscalari pre0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
540 simdscalari pre1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
541 simdscalari pre2
= _simd_unpacklo_epi16(src2
, src3
); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
542 simdscalari pre3
= _simd_unpackhi_epi16(src2
, src3
); // ba4 ba5 ba6 ba7 baC baD baE baF
544 simdscalari tmp0
= _simd_unpacklo_epi32(pre0
, pre2
); // rbga0 rbga1 rbga8 rbga9
545 simdscalari tmp1
= _simd_unpackhi_epi32(pre0
, pre2
); // rbga2 rbga3 rbgaA rbgaB
546 simdscalari tmp2
= _simd_unpacklo_epi32(pre1
, pre3
); // rbga4 rbga5 rgbaC rbgaD
547 simdscalari tmp3
= _simd_unpackhi_epi32(pre1
, pre3
); // rbga6 rbga7 rbgaE rbgaF
549 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
550 simdscalari dst1
= _simd_permute2f128_si(tmp2
, tmp3
, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
551 simdscalari dst2
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
552 simdscalari dst3
= _simd_permute2f128_si(tmp2
, tmp3
, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
554 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgbargbargbargba
555 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgbargbargbargba
556 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 2, dst2
); // rgbargbargbargba
557 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 3, dst3
); // rgbargbargbargba
562 //////////////////////////////////////////////////////////////////////////
563 /// Transpose16_16_16
564 //////////////////////////////////////////////////////////////////////////
565 struct Transpose16_16_16
567 //////////////////////////////////////////////////////////////////////////
568 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
569 /// @param pSrc - source data in SOA form
570 /// @param pDst - output data in AOS form
571 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
573 #if KNOB_SIMD_WIDTH == 8
574 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
576 simd4scalari src_r
= _simd_extractf128_si(src_rg
, 0);
577 simd4scalari src_g
= _simd_extractf128_si(src_rg
, 1);
578 simd4scalari src_b
= SIMD128::load_si((const simd4scalari
*)(pSrc
+ sizeof(simdscalari
)));
579 simd4scalari src_a
= SIMD128::setzero_si();
581 simd4scalari rg0
= SIMD128::unpacklo_epi16(src_r
, src_g
);
582 simd4scalari rg1
= SIMD128::unpackhi_epi16(src_r
, src_g
);
583 simd4scalari ba0
= SIMD128::unpacklo_epi16(src_b
, src_a
);
584 simd4scalari ba1
= SIMD128::unpackhi_epi16(src_b
, src_a
);
586 simd4scalari dst0
= SIMD128::unpacklo_epi32(rg0
, ba0
);
587 simd4scalari dst1
= SIMD128::unpackhi_epi32(rg0
, ba0
);
588 simd4scalari dst2
= SIMD128::unpacklo_epi32(rg1
, ba1
);
589 simd4scalari dst3
= SIMD128::unpackhi_epi32(rg1
, ba1
);
591 SIMD128::store_si(((simd4scalari
*)pDst
) + 0, dst0
);
592 SIMD128::store_si(((simd4scalari
*)pDst
) + 1, dst1
);
593 SIMD128::store_si(((simd4scalari
*)pDst
) + 2, dst2
);
594 SIMD128::store_si(((simd4scalari
*)pDst
) + 3, dst3
);
596 #error Unsupported vector width
599 #if ENABLE_AVX512_SIMD16
601 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
603 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
604 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
605 simdscalari src2
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 2); // bbbbbbbbbbbbbbbb
606 simdscalari src3
= _simd_setzero_si(); // aaaaaaaaaaaaaaaa
608 simdscalari pre0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
609 simdscalari pre1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
610 simdscalari pre2
= _simd_unpacklo_epi16(src2
, src3
); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
611 simdscalari pre3
= _simd_unpackhi_epi16(src2
, src3
); // ba4 ba5 ba6 ba7 baC baD baE baF
613 simdscalari tmp0
= _simd_unpacklo_epi32(pre0
, pre2
); // rbga0 rbga1 rbga8 rbga9
614 simdscalari tmp1
= _simd_unpackhi_epi32(pre0
, pre2
); // rbga2 rbga3 rbgaA rbgaB
615 simdscalari tmp2
= _simd_unpacklo_epi32(pre1
, pre3
); // rbga4 rbga5 rgbaC rbgaD
616 simdscalari tmp3
= _simd_unpackhi_epi32(pre1
, pre3
); // rbga6 rbga7 rbgaE rbgaF
618 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
619 simdscalari dst1
= _simd_permute2f128_si(tmp2
, tmp3
, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
620 simdscalari dst2
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
621 simdscalari dst3
= _simd_permute2f128_si(tmp2
, tmp3
, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
623 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgbargbargbargba
624 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgbargbargbargba
625 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 2, dst2
); // rgbargbargbargba
626 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 3, dst3
); // rgbargbargbargba
631 //////////////////////////////////////////////////////////////////////////
633 //////////////////////////////////////////////////////////////////////////
634 struct Transpose16_16
636 //////////////////////////////////////////////////////////////////////////
637 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
638 /// @param pSrc - source data in SOA form
639 /// @param pDst - output data in AOS form
640 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
642 #if KNOB_SIMD_WIDTH == 8
643 simdscalar src
= _simd_load_ps((const float*)pSrc
);
645 simd4scalar comp0
= _simd_extractf128_ps(src
, 0);
646 simd4scalar comp1
= _simd_extractf128_ps(src
, 1);
648 simd4scalari comp0i
= SIMD128::castps_si(comp0
);
649 simd4scalari comp1i
= SIMD128::castps_si(comp1
);
651 simd4scalari resLo
= SIMD128::unpacklo_epi16(comp0i
, comp1i
);
652 simd4scalari resHi
= SIMD128::unpackhi_epi16(comp0i
, comp1i
);
654 SIMD128::store_si((simd4scalari
*)pDst
, resLo
);
655 SIMD128::store_si((simd4scalari
*)pDst
+ 1, resHi
);
657 #error Unsupported vector width
660 #if ENABLE_AVX512_SIMD16
662 INLINE
static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
)
664 simdscalari src0
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
)); // rrrrrrrrrrrrrrrr
665 simdscalari src1
= _simd_load_si(reinterpret_cast<const simdscalari
*>(pSrc
) + 1); // gggggggggggggggg
667 simdscalari tmp0
= _simd_unpacklo_epi16(src0
, src1
); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
668 simdscalari tmp1
= _simd_unpackhi_epi16(src0
, src1
); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
670 simdscalari dst0
= _simd_permute2f128_si(tmp0
, tmp1
, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
671 simdscalari dst1
= _simd_permute2f128_si(tmp0
, tmp1
, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
673 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 0, dst0
); // rgrgrgrgrgrgrgrg
674 _simd_store_si(reinterpret_cast<simdscalari
*>(pDst
) + 1, dst1
); // rgrgrgrgrgrgrgrg
679 //////////////////////////////////////////////////////////////////////////
681 //////////////////////////////////////////////////////////////////////////
684 //////////////////////////////////////////////////////////////////////////
685 /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
/// @note Declared as deleted, so any call to it fails at compile time.
/// @note NOTE(review): the enclosing struct declaration is not visible in
///       this chunk; presumably Transpose24_8 -- confirm.
686 /// @param pSrc - source data in SOA form
687 /// @param pDst - output data in AOS form
688 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
689 #if ENABLE_AVX512_SIMD16
// SIMD16 entry point; only declared when ENABLE_AVX512_SIMD16 is set and
// likewise deleted.
691 static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
695 //////////////////////////////////////////////////////////////////////////
697 //////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point for packed 32_8_24 data; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
711 //////////////////////////////////////////////////////////////////////////
713 //////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point for packed 4_4_4_4 data; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
727 //////////////////////////////////////////////////////////////////////////
729 //////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point for packed 5_6_5 data; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
743 //////////////////////////////////////////////////////////////////////////
745 //////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point for packed 9_9_9_5 data; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
759 //////////////////////////////////////////////////////////////////////////
761 //////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point for packed 5_5_5_1 data; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
775 //////////////////////////////////////////////////////////////////////////
777 //////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point for packed 1_5_5_5 data; also deleted.
    ///        Declared so this struct exposes the same pair of entry points
    ///        as the other packed-format transposes in this file.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
787 //////////////////////////////////////////////////////////////////////////
788 /// Transpose10_10_10_2
789 //////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point for packed 10_10_10_2 data; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
803 //////////////////////////////////////////////////////////////////////////
804 /// Transpose11_11_10
805 //////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point for packed 11_11_10 data; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
819 //////////////////////////////////////////////////////////////////////////
821 //////////////////////////////////////////////////////////////////////////
824 //////////////////////////////////////////////////////////////////////////
825 /// @brief Performs an SOA to AOS conversion
/// @note Declared as deleted, so any call to it fails at compile time.
/// @note NOTE(review): the enclosing struct declaration is not visible in
///       this chunk, so the format this applies to cannot be stated here --
///       confirm against the full file.
826 /// @param pSrc - source data in SOA form
827 /// @param pDst - output data in AOS form
828 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
829 #if ENABLE_AVX512_SIMD16
// SIMD16 entry point; only declared when ENABLE_AVX512_SIMD16 is set and
// likewise deleted.
831 static void Transpose_16(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
835 //////////////////////////////////////////////////////////////////////////
837 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for this format.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
851 //////////////////////////////////////////////////////////////////////////
852 /// Transpose64_64_64
853 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for this format.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
867 //////////////////////////////////////////////////////////////////////////
868 /// Transpose64_64_64_64
869 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for this format.
    ///        Declared as deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief SIMD16 entry point; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};