swr: [rasterizer core/sim] 8x2 backend + 16-wide tile clear/load/store
[mesa.git] src/gallium/drivers/swr/rasterizer/core/utils.h
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file utils.h
*
* @brief Utilities used by SWR core.
*
******************************************************************************/
#pragma once

#include <string.h>
#include <type_traits>
#include <algorithm>
#include "common/os.h"
#include "common/simdintrin.h"
#include "common/swr_assert.h"
#include "core/api.h"

#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
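// _mm_insert_epi64 / _mm_extract_epi64 are only available in 64-bit builds;
// on 32-bit targets, emulate them by bouncing the vector through aligned memory.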
INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    OSALIGNLINE(uint32_t) elems[4];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        uint64_t foo = elems[0];
        foo |= (uint64_t)elems[1] << 32;
        return foo;
    }
    else
    {
        uint64_t foo = elems[2];
        foo |= (uint64_t)elems[3] << 32;
        return foo;
    }
}

INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    OSALIGNLINE(int64_t) elems[2];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        elems[0] = b;
    }
    else
    {
        elems[1] = b;
    }
    __m128i out;
    out = _mm_load_si128((const __m128i*)elems);
    return out;
}
#endif

struct simdBBox
{
    simdscalari ymin;
    simdscalari ymax;
    simdscalari xmin;
    simdscalari xmax;
};

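// In-place 4x4 transpose of 32-bit elements: interleave row pairs with dword
// unpacks, then recombine with qword unpacks.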
INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    __m128i row0i = _mm_castps_si128(row0);
    __m128i row1i = _mm_castps_si128(row1);
    __m128i row2i = _mm_castps_si128(row2);
    __m128i row3i = _mm_castps_si128(row3);

    __m128i vTemp = row2i;
    row2i = _mm_unpacklo_epi32(row2i, row3i);
    vTemp = _mm_unpackhi_epi32(vTemp, row3i);

    row3i = row0i;
    row0i = _mm_unpacklo_epi32(row0i, row1i);
    row3i = _mm_unpackhi_epi32(row3i, row1i);

    row1i = row0i;
    row0i = _mm_unpacklo_epi64(row0i, row2i);
    row1i = _mm_unpackhi_epi64(row1i, row2i);

    row2i = row3i;
    row2i = _mm_unpacklo_epi64(row2i, vTemp);
    row3i = _mm_unpackhi_epi64(row3i, vTemp);

    row0 = _mm_castsi128_ps(row0i);
    row1 = _mm_castsi128_ps(row1i);
    row2 = _mm_castsi128_ps(row2i);
    row3 = _mm_castsi128_ps(row3i);
}

INLINE
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    __m128i vTemp = row2;
    row2 = _mm_unpacklo_epi32(row2, row3);
    vTemp = _mm_unpackhi_epi32(vTemp, row3);

    row3 = row0;
    row0 = _mm_unpacklo_epi32(row0, row1);
    row3 = _mm_unpackhi_epi32(row3, row1);

    row1 = row0;
    row0 = _mm_unpacklo_epi64(row0, row2);
    row1 = _mm_unpackhi_epi64(row1, row2);

    row2 = row3;
    row2 = _mm_unpacklo_epi64(row2, vTemp);
    row3 = _mm_unpackhi_epi64(row3, vTemp);
}

#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

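// The _mm*_undefined_* intrinsics are missing on clang and gcc older than 4.9;
// map them to explicit zeroing there.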
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif

#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
INLINE
void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
{
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());  //y0w0y1w1 y4w4y5w5
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                         //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());         //y2w2y3w3 y6w6y7w7
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7

    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}

INLINE
void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
{
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);             //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3);             //y2w2y3w3 y6w6y7w7
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7

    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}

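// Full 8x8 float transpose: unpack within 128-bit lanes, shuffle dword pairs,
// then permute2f128 to move the cross-lane halves into place.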
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
{
    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
    __m256 __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1,0,1,0));
    __m256 __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3,2,3,2));
    __m256 __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1,0,1,0));
    __m256 __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3,2,3,2));
    __m256 __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1,0,1,0));
    __m256 __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3,2,3,2));
    __m256 __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1,0,1,0));
    __m256 __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3,2,3,2));
    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}

INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
{
    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
}
#endif

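// Each Transpose* helper below converts one SIMD tile's worth of pixels from
// SOA form (one plane per component) to AOS form (components interleaved per
// pixel). Formats with no practical AOS conversion delete their Transpose
// methods so any accidental use fails at compile time.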
//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
//////////////////////////////////////////////////////////////////////////
template<uint32_t bpp>
struct TransposeSingleComponent
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Pass-thru for single component.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        __m128i c0c1 = _mm256_castsi256_si128(src);                                          // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                       // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                       // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                         // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                         // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                      // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                      // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
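        // The first byte-shuffle scatters the in-lane R/G (low lane) and B/A
        // (high lane) bytes to their interleaved positions, zeroing the rest
        // (0x80 selectors). The cross-lane permute swaps the 128-bit halves so
        // a second shuffle can fill the complementary bytes, and the OR merges
        // the two partial results.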
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#elif KNOB_SIMD_WIDTH == 16
        simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);

        simdscalari dst01 = _simd_shuffle_epi8(src, mask0);

        simdscalari perm1 = _simd_permute_128(src, src, 1);

        simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);

        simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);

        simdscalari dst = _simd_or_si(dst01, dst23);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

        simd16scalari mask0 = _simd16_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);

        simd16scalari dst01 = _simd16_shuffle_epi8(src, mask0);

        simd16scalari perm1 = _simd16_permute2f128_si(src, src, 1);

        simd16scalari mask1 = _simd16_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);

        simd16scalari dst23 = _simd16_shuffle_epi8(perm1, mask1);

        simd16scalari dst = _simd16_or_si(dst01, dst23);

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        __m128i rg = _mm256_castsi256_si128(src);  // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);    // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);
        _mm_store_si128((__m128i*)pDst, rg);
#elif KNOB_SIMD_WIDTH == 16
        __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg

        __m256i r = _mm256_permute4x64_epi64(src, 0x50);  // 0x50 = 01010000b    // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

        __m256i g = _mm256_permute4x64_epi64(src, 0xFA);  // 0xFA = 11111010b    // ggggggggxxxxxxxxggggggggxxxxxxxx

        __m256i dst = _mm256_unpacklo_epi8(r, g);                                // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

        _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg

        __m256i r = _mm256_permute4x64_epi64(src, 0x50);  // 0x50 = 01010000b    // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

        __m256i g = _mm256_permute4x64_epi64(src, 0xFA);  // 0xFA = 11111010b    // ggggggggxxxxxxxxggggggggxxxxxxxx

        __m256i dst = _mm256_unpacklo_epi8(r, g);                                // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

        _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

        __m128 vDst[8];
        vTranspose4x8(vDst, src0, src1, src2, src3);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
        simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48);

        __m128 vDst[8];

        vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);

        __m128 vDst[8];

        vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0));

#if 1
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
#else
        _mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]);
#endif

        vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1));

#if 1
        // vDst holds only two simd16scalars (8 x __m128), so after the second
        // transpose the fresh results are again at indices 0 and 1.
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[1]);
#else
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]);
#endif
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

        __m128 vDst[8];
        vTranspose3x8(vDst, src0, src1, src2);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);

        __m128 vDst[8];

        vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);

        __m128 vDst[8];

        vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0));

#if 1
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
#else
        _mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]);
#endif

        vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1));

#if 1
        // vDst holds only two simd16scalars (8 x __m128), so after the second
        // transpose the fresh results are again at indices 0 and 1.
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[1]);
#else
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]);
#endif
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        const float* pfSrc = (const float*)pSrc;
        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
        __m128 src_g1 = _mm_load_ps(pfSrc + 12);

        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm_store_ps(pfDst + 0, dst0);
        _mm_store_ps(pfDst + 4, dst1);
        _mm_store_ps(pfDst + 8, dst2);
        _mm_store_ps(pfDst + 12, dst3);
#elif KNOB_SIMD_WIDTH == 16
        const float* pfSrc = (const float*)pSrc;
        __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
        __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
        __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
        __m256 src_g1 = _mm256_load_ps(pfSrc + 24);

        __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
        __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
        __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
        __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm256_store_ps(pfDst + 0, dst0);
        _mm256_store_ps(pfDst + 8, dst1);
        _mm256_store_ps(pfDst + 16, dst2);
        _mm256_store_ps(pfDst + 24, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        const float *pfSrc = reinterpret_cast<const float *>(pSrc);

        __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
        __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
        __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
        __m256 src_g1 = _mm256_load_ps(pfSrc + 24);

        __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
        __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
        __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
        __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);

        float *pfDst = reinterpret_cast<float *>(pDst);

        _mm256_store_ps(pfDst + 0, dst0);
        _mm256_store_ps(pfDst + 8, dst1);
        _mm256_store_ps(pfDst + 16, dst2);
        _mm256_store_ps(pfDst + 24, dst3);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
        simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari)));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = src_ba.lo;
        __m256i src_a = src_ba.hi;

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
        simd16scalari src_ba = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc + sizeof(simd16scalari)));

        __m256i src_r = _simd16_extract_si(src_rg, 0);
        __m256i src_g = _simd16_extract_si(src_rg, 1);
        __m256i src_b = _simd16_extract_si(src_ba, 0);
        __m256i src_a = _simd16_extract_si(src_ba, 1);

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        __m128i src_a = _mm_undefined_si128();

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari)));
        __m256i src_a = _mm256_undefined_si256();

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

        __m256i src_r = _simd16_extract_si(src_rg, 0);
        __m256i src_g = _simd16_extract_si(src_rg, 1);
        __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc + sizeof(simd16scalari)));
        __m256i src_a = _mm256_undefined_si256();

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));

        simdscalari result;

        result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
        result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari result = _simd16_setzero_si();

        simd16scalari src = _simd16_castps_si(_simd16_load_ps(reinterpret_cast<const float *>(pSrc)));

        simdscalari srclo = _simd16_extract_si(src, 0);
        simdscalari srchi = _simd16_extract_si(src, 1);

        result = _simd16_insert_si(result, _mm256_unpacklo_epi16(srclo, srchi), 0);
        result = _simd16_insert_si(result, _mm256_unpackhi_epi16(srclo, srchi), 1);

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), result);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};

//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

// helper function to unroll loops
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};

template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
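
// Example: UnrollerL<0, 4>::step(f) expands at compile time to
// f(0); f(1); f(2); f(3);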

// helper function to unroll loops, with mask to skip specific iterations
template<int Begin, int End, int Step = 1, int Mask = 0x7f>
struct UnrollerLMask {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        if (Mask & (1 << Begin))
        {
            func(Begin);
        }
        // Recurse with the mask preserved so later iterations are also tested.
        UnrollerLMask<Begin + Step, End, Step, Mask>::step(func);
    }
};

template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};

// general CRC compute
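// (CRC-32C, i.e. the Castagnoli polynomial implemented by the SSE4.2 crc32
// instructions)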
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    uint32_t sizeInQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    uint64_t* pDataWords = (uint64_t*)pData;
    for (uint32_t i = 0; i < sizeInQwords; ++i)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    uint32_t sizeInDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    uint32_t* pDataWords = (uint32_t*)pData;
    for (uint32_t i = 0; i < sizeInDwords; ++i)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}

//////////////////////////////////////////////////////////////////////////
/// Add byte offset to any-type pointer
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static T* PtrAdd(T* p, intptr_t offset)
{
    intptr_t intp = reinterpret_cast<intptr_t>(p);
    return reinterpret_cast<T*>(intp + offset);
}

//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
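/// Note: value & (0 - value) isolates the lowest set bit; zero is also
/// reported as a power of two.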
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static bool IsPow2(T value)
{
    return value == (value & (0 - value));
}

//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));
    return value & ~T1(alignment - 1);
}

//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    return AlignDownPow2(value + T1(alignment - 1), alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    return reinterpret_cast<T1*>(
        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
}

//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDown(T1 value, T2 alignment)
{
    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
    return value - T1(value % alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignDown(T1* value, T2 alignment)
{
    return (T1*)AlignDown(uintptr_t(value), alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: delegates to AlignDown, so non-power-of-2 alignments are handled
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUp(T1 value, T2 alignment)
{
    return AlignDown(value + T1(alignment - 1), alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: delegates to AlignDown, so non-power-of-2 alignments are handled
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUp(T1* value, T2 alignment)
{
    return AlignDown(PtrAdd(value, alignment - 1), alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};

public:

    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
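
// Example: BitsArray<uint8_t, 4, 16> packs 16 4-bit elements; on a 64-bit
// build they share a single size_t word, and operator[] shifts and masks the
// requested element back out.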

// Ranged integer argument for TemplateArgUnroller
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};

// Recursive template used to auto-nest conditionals. Converts dynamic boolean
// and ranged-integer function arguments to static template arguments.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val});
        }
        SWR_ASSUME(false); return nullptr;
    }
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        SWR_ASSUME(false); return nullptr;
    }
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
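
// Example (names are illustrative, not part of this header): given a
// terminator that maps the accumulated template arguments to a function
// pointer,
//
//     struct MyFuncGetter
//     {
//         typedef void (*FuncType)();
//         template <typename... ArgsT>
//         static FuncType GetFunc() { return &MyFunc<ArgsT...>; }
//     };
//
// a call such as
//
//     TemplateArgUnroller<MyFuncGetter>::GetFunc(bEnabled, IntArg<1, 4>{numSamples});
//
// returns MyFunc<std::true_type or std::false_type,
//                std::integral_constant<uint32_t, 1..4>>,
// selected at runtime from the dynamic argument values.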