b51755dab50bf589c7cd12c14304a738052bb5f3
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / format_utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file format_utils.h
24 *
25 * @brief Utilities used by SWR core related to pixel formats.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "core/utils.h"
31 #include "common/simdintrin.h"
32
33 INLINE
34 void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3)
35 {
36 simd4scalari row0i = SIMD128::castps_si(row0);
37 simd4scalari row1i = SIMD128::castps_si(row1);
38 simd4scalari row2i = SIMD128::castps_si(row2);
39 simd4scalari row3i = SIMD128::castps_si(row3);
40
41 simd4scalari vTemp = row2i;
42 row2i = SIMD128::unpacklo_epi32(row2i, row3i);
43 vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
44
45 row3i = row0i;
46 row0i = SIMD128::unpacklo_epi32(row0i, row1i);
47 row3i = SIMD128::unpackhi_epi32(row3i, row1i);
48
49 row1i = row0i;
50 row0i = SIMD128::unpacklo_epi64(row0i, row2i);
51 row1i = SIMD128::unpackhi_epi64(row1i, row2i);
52
53 row2i = row3i;
54 row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
55 row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
56
57 row0 = SIMD128::castsi_ps(row0i);
58 row1 = SIMD128::castsi_ps(row1i);
59 row2 = SIMD128::castsi_ps(row2i);
60 row3 = SIMD128::castsi_ps(row3i);
61 }
62
63 INLINE
64 void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3)
65 {
66 simd4scalari vTemp = row2;
67 row2 = SIMD128::unpacklo_epi32(row2, row3);
68 vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
69
70 row3 = row0;
71 row0 = SIMD128::unpacklo_epi32(row0, row1);
72 row3 = SIMD128::unpackhi_epi32(row3, row1);
73
74 row1 = row0;
75 row0 = SIMD128::unpacklo_epi64(row0, row2);
76 row1 = SIMD128::unpackhi_epi64(row1, row2);
77
78 row2 = row3;
79 row2 = SIMD128::unpacklo_epi64(row2, vTemp);
80 row3 = SIMD128::unpackhi_epi64(row3, vTemp);
81 }
82
83 #if KNOB_SIMD_WIDTH == 8
84 INLINE
85 void vTranspose3x8(simd4scalar (&vDst)[8],
86 const simdscalar& vSrc0,
87 const simdscalar& vSrc1,
88 const simdscalar& vSrc2)
89 {
90 simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
91 simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
92 simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4
93 simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5
94
95 r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
96 r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6yw77
97 simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6
98 simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7
99
100 vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
101 vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
102 vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
103 vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
104
105 vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
106 vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
107 vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
108 vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
109 }
110
111 INLINE
112 void vTranspose4x8(simd4scalar (&vDst)[8],
113 const simdscalar& vSrc0,
114 const simdscalar& vSrc1,
115 const simdscalar& vSrc2,
116 const simdscalar& vSrc3)
117 {
118 simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
119 simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
120 simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4
121 simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5
122
123 r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
124 r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6yw77
125 simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6
126 simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7
127
128 vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
129 vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
130 vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
131 vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
132
133 vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
134 vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
135 vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
136 vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
137 }
138
139 #if ENABLE_AVX512_SIMD16
140 INLINE
141 void vTranspose4x16(simd16scalar (&dst)[4],
142 const simd16scalar& src0,
143 const simd16scalar& src1,
144 const simd16scalar& src2,
145 const simd16scalar& src3)
146 {
147 const simd16scalari perm =
148 _simd16_set_epi32(15,
149 11,
150 7,
151 3,
152 14,
153 10,
154 6,
155 2,
156 13,
157 9,
158 5,
159 1,
160 12,
161 8,
162 4,
163 0); // pre-permute input to setup the right order after all the unpacking
164
165 simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
166 simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
167 simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
168 simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
169
170 simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
171 simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
172 simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
173 simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
174
175 dst[0] = _simd16_unpacklo_ps(rblo, galo);
176 dst[1] = _simd16_unpackhi_ps(rblo, galo);
177 dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
178 dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
179 }
180
181 #endif
182 INLINE
183 void vTranspose8x8(simdscalar (&vDst)[8],
184 const simdscalar& vMask0,
185 const simdscalar& vMask1,
186 const simdscalar& vMask2,
187 const simdscalar& vMask3,
188 const simdscalar& vMask4,
189 const simdscalar& vMask5,
190 const simdscalar& vMask6,
191 const simdscalar& vMask7)
192 {
193 simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
194 simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
195 simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
196 simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
197 simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
198 simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
199 simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
200 simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
201 simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0));
202 simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2));
203 simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0));
204 simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2));
205 simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0));
206 simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2));
207 simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0));
208 simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2));
209 vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
210 vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
211 vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
212 vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
213 vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
214 vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
215 vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
216 vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
217 }
218
219 INLINE
220 void vTranspose8x8(simdscalar (&vDst)[8],
221 const simdscalari& vMask0,
222 const simdscalari& vMask1,
223 const simdscalari& vMask2,
224 const simdscalari& vMask3,
225 const simdscalari& vMask4,
226 const simdscalari& vMask5,
227 const simdscalari& vMask6,
228 const simdscalari& vMask7)
229 {
230 vTranspose8x8(vDst,
231 _simd_castsi_ps(vMask0),
232 _simd_castsi_ps(vMask1),
233 _simd_castsi_ps(vMask2),
234 _simd_castsi_ps(vMask3),
235 _simd_castsi_ps(vMask4),
236 _simd_castsi_ps(vMask5),
237 _simd_castsi_ps(vMask6),
238 _simd_castsi_ps(vMask7));
239 }
240 #endif
241
242 //////////////////////////////////////////////////////////////////////////
243 /// TransposeSingleComponent
244 //////////////////////////////////////////////////////////////////////////
245 template <uint32_t bpp>
246 struct TransposeSingleComponent
247 {
248 //////////////////////////////////////////////////////////////////////////
249 /// @brief Pass-thru for single component.
250 /// @param pSrc - source data in SOA form
251 /// @param pDst - output data in AOS form
252 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
253 {
254 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
255 }
256 #if ENABLE_AVX512_SIMD16
257
258 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
259 {
260 memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
261 }
262 #endif
263 };
264
265 //////////////////////////////////////////////////////////////////////////
266 /// Transpose8_8_8_8
267 //////////////////////////////////////////////////////////////////////////
268 struct Transpose8_8_8_8
269 {
270 //////////////////////////////////////////////////////////////////////////
271 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
272 /// @param pSrc - source data in SOA form
273 /// @param pDst - output data in AOS form
274 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
275 {
276 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
277
278 #if KNOB_SIMD_WIDTH == 8
279 #if KNOB_ARCH <= KNOB_ARCH_AVX
280 simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
281 simd4scalari c2c3 =
282 SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
283 simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
284 simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
285 simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
286 simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
287 simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
288 simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
289 SIMD128::store_si((simd4scalari*)pDst, c0123lo);
290 SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
291 #else
292 simdscalari dst01 = _simd_shuffle_epi8(src,
293 _simd_set_epi32(0x0f078080,
294 0x0e068080,
295 0x0d058080,
296 0x0c048080,
297 0x80800b03,
298 0x80800a02,
299 0x80800901,
300 0x80800800));
301 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
302 dst23 = _simd_shuffle_epi8(dst23,
303 _simd_set_epi32(0x80800f07,
304 0x80800e06,
305 0x80800d05,
306 0x80800c04,
307 0x0b038080,
308 0x0a028080,
309 0x09018080,
310 0x08008080));
311 simdscalari dst = _simd_or_si(dst01, dst23);
312 _simd_store_si((simdscalari*)pDst, dst);
313 #endif
314 #else
315 #error Unsupported vector width
316 #endif
317 }
318 #if ENABLE_AVX512_SIMD16
319
320 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
321 {
322 simd4scalari src0 =
323 SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
324 simd4scalari src1 =
325 SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
326 simd4scalari src2 =
327 SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
328 simd4scalari src3 =
329 SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
330
331 simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
332 simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
333 simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
334 simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
335
336 simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
337 simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
338 simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
339
340 simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
341
342 _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst),
343 dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
344 }
345 #endif
346 };
347
348 //////////////////////////////////////////////////////////////////////////
349 /// Transpose8_8_8
350 //////////////////////////////////////////////////////////////////////////
351 struct Transpose8_8_8
352 {
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
355 /// @param pSrc - source data in SOA form
356 /// @param pDst - output data in AOS form
357 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
358 #if ENABLE_AVX512_SIMD16
359
360 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
361 #endif
362 };
363
364 //////////////////////////////////////////////////////////////////////////
365 /// Transpose8_8
366 //////////////////////////////////////////////////////////////////////////
367 struct Transpose8_8
368 {
369 //////////////////////////////////////////////////////////////////////////
370 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
371 /// @param pSrc - source data in SOA form
372 /// @param pDst - output data in AOS form
373 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
374 {
375 #if KNOB_SIMD_WIDTH == 8
376 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
377
378 simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
379 simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
380 rg = SIMD128::unpacklo_epi8(rg, g);
381 SIMD128::store_si((simd4scalari*)pDst, rg);
382 #else
383 #error Unsupported vector width
384 #endif
385 }
386 #if ENABLE_AVX512_SIMD16
387
388 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
389 {
390 simd4scalari src0 =
391 SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
392 simd4scalari src1 =
393 SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
394
395 simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
396 simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
397
398 simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
399
400 simdscalari dst = _simd_or_si(cvt0, shl1);
401
402 _simd_store_si(reinterpret_cast<simdscalari*>(pDst),
403 dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
404 }
405 #endif
406 };
407
408 //////////////////////////////////////////////////////////////////////////
409 /// Transpose32_32_32_32
410 //////////////////////////////////////////////////////////////////////////
411 struct Transpose32_32_32_32
412 {
413 //////////////////////////////////////////////////////////////////////////
414 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
415 /// @param pSrc - source data in SOA form
416 /// @param pDst - output data in AOS form
417 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
418 {
419 #if KNOB_SIMD_WIDTH == 8
420 simdscalar src0 = _simd_load_ps((const float*)pSrc);
421 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
422 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
423 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
424
425 simd4scalar vDst[8];
426 vTranspose4x8(vDst, src0, src1, src2, src3);
427 SIMD128::store_ps((float*)pDst, vDst[0]);
428 SIMD128::store_ps((float*)pDst + 4, vDst[1]);
429 SIMD128::store_ps((float*)pDst + 8, vDst[2]);
430 SIMD128::store_ps((float*)pDst + 12, vDst[3]);
431 SIMD128::store_ps((float*)pDst + 16, vDst[4]);
432 SIMD128::store_ps((float*)pDst + 20, vDst[5]);
433 SIMD128::store_ps((float*)pDst + 24, vDst[6]);
434 SIMD128::store_ps((float*)pDst + 28, vDst[7]);
435 #else
436 #error Unsupported vector width
437 #endif
438 }
439 #if ENABLE_AVX512_SIMD16
440
441 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
442 {
443 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
444 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
445 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
446 simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
447
448 simd16scalar dst[4];
449
450 vTranspose4x16(dst, src0, src1, src2, src3);
451
452 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
453 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
454 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
455 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
456 }
457 #endif
458 };
459
460 //////////////////////////////////////////////////////////////////////////
461 /// Transpose32_32_32
462 //////////////////////////////////////////////////////////////////////////
463 struct Transpose32_32_32
464 {
465 //////////////////////////////////////////////////////////////////////////
466 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
467 /// @param pSrc - source data in SOA form
468 /// @param pDst - output data in AOS form
469 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
470 {
471 #if KNOB_SIMD_WIDTH == 8
472 simdscalar src0 = _simd_load_ps((const float*)pSrc);
473 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
474 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
475
476 simd4scalar vDst[8];
477 vTranspose3x8(vDst, src0, src1, src2);
478 SIMD128::store_ps((float*)pDst, vDst[0]);
479 SIMD128::store_ps((float*)pDst + 4, vDst[1]);
480 SIMD128::store_ps((float*)pDst + 8, vDst[2]);
481 SIMD128::store_ps((float*)pDst + 12, vDst[3]);
482 SIMD128::store_ps((float*)pDst + 16, vDst[4]);
483 SIMD128::store_ps((float*)pDst + 20, vDst[5]);
484 SIMD128::store_ps((float*)pDst + 24, vDst[6]);
485 SIMD128::store_ps((float*)pDst + 28, vDst[7]);
486 #else
487 #error Unsupported vector width
488 #endif
489 }
490 #if ENABLE_AVX512_SIMD16
491
492 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
493 {
494 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
495 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
496 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
497 simd16scalar src3 = _simd16_setzero_ps();
498
499 simd16scalar dst[4];
500
501 vTranspose4x16(dst, src0, src1, src2, src3);
502
503 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
504 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
505 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
506 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
507 }
508 #endif
509 };
510
511 //////////////////////////////////////////////////////////////////////////
512 /// Transpose32_32
513 //////////////////////////////////////////////////////////////////////////
514 struct Transpose32_32
515 {
516 //////////////////////////////////////////////////////////////////////////
517 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
518 /// @param pSrc - source data in SOA form
519 /// @param pDst - output data in AOS form
520 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
521 {
522 #if KNOB_SIMD_WIDTH == 8
523 const float* pfSrc = (const float*)pSrc;
524 simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
525 simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
526 simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
527 simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
528
529 simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
530 simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
531 simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
532 simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
533
534 float* pfDst = (float*)pDst;
535 SIMD128::store_ps(pfDst + 0, dst0);
536 SIMD128::store_ps(pfDst + 4, dst1);
537 SIMD128::store_ps(pfDst + 8, dst2);
538 SIMD128::store_ps(pfDst + 12, dst3);
539 #else
540 #error Unsupported vector width
541 #endif
542 }
543 #if ENABLE_AVX512_SIMD16
544
545 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
546 {
547 simd16scalar src0 =
548 _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr
549 simd16scalar src1 =
550 _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
551
552 simd16scalar tmp0 =
553 _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
554 simd16scalar tmp1 =
555 _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
556
557 simd16scalar per0 = _simd16_permute2f128_ps(
558 tmp0,
559 tmp1,
560 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
561 simd16scalar per1 = _simd16_permute2f128_ps(
562 tmp0,
563 tmp1,
564 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
565
566 simd16scalar dst0 = _simd16_permute2f128_ps(
567 per0,
568 per0,
569 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
570 simd16scalar dst1 = _simd16_permute2f128_ps(
571 per1,
572 per1,
573 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
574
575 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
576 _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
577 }
578 #endif
579 };
580
581 //////////////////////////////////////////////////////////////////////////
582 /// Transpose16_16_16_16
583 //////////////////////////////////////////////////////////////////////////
584 struct Transpose16_16_16_16
585 {
586 //////////////////////////////////////////////////////////////////////////
587 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
588 /// @param pSrc - source data in SOA form
589 /// @param pDst - output data in AOS form
590 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
591 {
592 #if KNOB_SIMD_WIDTH == 8
593 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
594 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
595
596 simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
597 simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
598 simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
599 simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
600
601 simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
602 simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
603 simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
604 simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
605
606 simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
607 simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
608 simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
609 simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
610
611 SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
612 SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
613 SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
614 SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
615 #else
616 #error Unsupported vector width
617 #endif
618 }
619 #if ENABLE_AVX512_SIMD16
620
621 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
622 {
623 simdscalari src0 =
624 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
625 simdscalari src1 =
626 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
627 simdscalari src2 =
628 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
629 simdscalari src3 =
630 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
631
632 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
633 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
634 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
635 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
636
637 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
638 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
639 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
640 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
641
642 simdscalari dst0 = _simd_permute2f128_si(
643 tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
644 simdscalari dst1 = _simd_permute2f128_si(
645 tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
646 simdscalari dst2 = _simd_permute2f128_si(
647 tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
648 simdscalari dst3 = _simd_permute2f128_si(
649 tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
650
651 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
652 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
653 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
654 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
655 }
656 #endif
657 };
658
659 //////////////////////////////////////////////////////////////////////////
660 /// Transpose16_16_16
661 //////////////////////////////////////////////////////////////////////////
662 struct Transpose16_16_16
663 {
664 //////////////////////////////////////////////////////////////////////////
665 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
666 /// @param pSrc - source data in SOA form
667 /// @param pDst - output data in AOS form
668 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
669 {
670 #if KNOB_SIMD_WIDTH == 8
671 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
672
673 simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
674 simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
675 simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
676 simd4scalari src_a = SIMD128::setzero_si();
677
678 simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
679 simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
680 simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
681 simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
682
683 simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
684 simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
685 simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
686 simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
687
688 SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
689 SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
690 SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
691 SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
692 #else
693 #error Unsupported vector width
694 #endif
695 }
696 #if ENABLE_AVX512_SIMD16
697
698 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
699 {
700 simdscalari src0 =
701 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
702 simdscalari src1 =
703 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
704 simdscalari src2 =
705 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
706 simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa
707
708 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
709 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
710 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
711 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
712
713 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
714 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
715 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
716 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
717
718 simdscalari dst0 = _simd_permute2f128_si(
719 tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
720 simdscalari dst1 = _simd_permute2f128_si(
721 tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
722 simdscalari dst2 = _simd_permute2f128_si(
723 tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
724 simdscalari dst3 = _simd_permute2f128_si(
725 tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
726
727 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
728 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
729 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
730 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
731 }
732 #endif
733 };
734
735 //////////////////////////////////////////////////////////////////////////
736 /// Transpose16_16
737 //////////////////////////////////////////////////////////////////////////
738 struct Transpose16_16
739 {
740 //////////////////////////////////////////////////////////////////////////
741 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
742 /// @param pSrc - source data in SOA form
743 /// @param pDst - output data in AOS form
744 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
745 {
746 #if KNOB_SIMD_WIDTH == 8
747 simdscalar src = _simd_load_ps((const float*)pSrc);
748
749 simd4scalar comp0 = _simd_extractf128_ps(src, 0);
750 simd4scalar comp1 = _simd_extractf128_ps(src, 1);
751
752 simd4scalari comp0i = SIMD128::castps_si(comp0);
753 simd4scalari comp1i = SIMD128::castps_si(comp1);
754
755 simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
756 simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
757
758 SIMD128::store_si((simd4scalari*)pDst, resLo);
759 SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
760 #else
761 #error Unsupported vector width
762 #endif
763 }
764 #if ENABLE_AVX512_SIMD16
765
766 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
767 {
768 simdscalari src0 =
769 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
770 simdscalari src1 =
771 _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
772
773 simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
774 simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
775
776 simdscalari dst0 = _simd_permute2f128_si(
777 tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
778 simdscalari dst1 = _simd_permute2f128_si(
779 tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
780
781 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
782 _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
783 }
784 #endif
785 };
786
787 //////////////////////////////////////////////////////////////////////////
788 /// Transpose24_8
789 //////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
802
803 //////////////////////////////////////////////////////////////////////////
804 /// Transpose32_8_24
805 //////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
818
819 //////////////////////////////////////////////////////////////////////////
820 /// Transpose4_4_4_4
821 //////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
834
835 //////////////////////////////////////////////////////////////////////////
836 /// Transpose5_6_5
837 //////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
850
851 //////////////////////////////////////////////////////////////////////////
852 /// Transpose9_9_9_5
853 //////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
866
867 //////////////////////////////////////////////////////////////////////////
868 /// Transpose5_5_5_1
869 //////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
882
883 //////////////////////////////////////////////////////////////////////////
884 /// Transpose1_5_5_5
885 //////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted. Declared so this struct exposes the same
    ///        pair of entry points as the other packed-format transposes, and
    ///        a stray call reports "use of deleted function" rather than
    ///        "no member named Transpose_16".
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
894
895 //////////////////////////////////////////////////////////////////////////
896 /// Transpose10_10_10_2
897 //////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
910
911 //////////////////////////////////////////////////////////////////////////
912 /// Transpose11_11_10
913 //////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
926
927 //////////////////////////////////////////////////////////////////////////
928 /// Transpose64
929 //////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point for the 64 layout.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
942
943 //////////////////////////////////////////////////////////////////////////
944 /// Transpose64_64
945 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point for the 64_64 layout.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
958
959 //////////////////////////////////////////////////////////////////////////
960 /// Transpose64_64_64
961 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point for the 64_64_64 layout.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
974
975 //////////////////////////////////////////////////////////////////////////
976 /// Transpose64_64_64_64
977 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point for the 64_64_64_64 layout.
    ///        Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    //////////////////////////////////////////////////////////////////////////
    /// @brief Transpose_16 variant, declared only when ENABLE_AVX512_SIMD16
    ///        is set; also deleted.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};