/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file utils.h
*
* @brief Utilities used by SWR core.
*
******************************************************************************/
#pragma once

#include <string.h>
#include <type_traits>
#include <algorithm>
#include "common/os.h"
#include "common/simdintrin.h"
#include "common/swr_assert.h"
#include "core/api.h"

#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    OSALIGNLINE(uint32_t) elems[4];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        uint64_t foo = elems[0];
        foo |= (uint64_t)elems[1] << 32;
        return foo;
    }
    else
    {
        uint64_t foo = elems[2];
        foo |= (uint64_t)elems[3] << 32;
        return foo;
    }
}

INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    OSALIGNLINE(int64_t) elems[2];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        elems[0] = b;
    }
    else
    {
        elems[1] = b;
    }
    __m128i out;
    out = _mm_load_si128((const __m128i*)elems);
    return out;
}
#endif
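
// Usage sketch (illustrative, not part of the upstream header): on either
// path the macro/function follows the SSE4.1 intrinsic semantics, where ndx
// selects the 64-bit lane:
//   __m128i v  = _mm_set_epi64x(0x1122334455667788LL, 0LL);
//   int64_t hi = _MM_EXTRACT_EPI64(v, 1);    // 0x1122334455667788
//   v          = _MM_INSERT_EPI64(v, hi, 0); // copy high lane into low lane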

struct simdBBox
{
    simdscalari ymin;
    simdscalari ymax;
    simdscalari xmin;
    simdscalari xmax;
};

INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    __m128i row0i = _mm_castps_si128(row0);
    __m128i row1i = _mm_castps_si128(row1);
    __m128i row2i = _mm_castps_si128(row2);
    __m128i row3i = _mm_castps_si128(row3);

    __m128i vTemp = row2i;
    row2i = _mm_unpacklo_epi32(row2i, row3i);
    vTemp = _mm_unpackhi_epi32(vTemp, row3i);

    row3i = row0i;
    row0i = _mm_unpacklo_epi32(row0i, row1i);
    row3i = _mm_unpackhi_epi32(row3i, row1i);

    row1i = row0i;
    row0i = _mm_unpacklo_epi64(row0i, row2i);
    row1i = _mm_unpackhi_epi64(row1i, row2i);

    row2i = row3i;
    row2i = _mm_unpacklo_epi64(row2i, vTemp);
    row3i = _mm_unpackhi_epi64(row3i, vTemp);

    row0 = _mm_castsi128_ps(row0i);
    row1 = _mm_castsi128_ps(row1i);
    row2 = _mm_castsi128_ps(row2i);
    row3 = _mm_castsi128_ps(row3i);
}

INLINE
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    __m128i vTemp = row2;
    row2 = _mm_unpacklo_epi32(row2, row3);
    vTemp = _mm_unpackhi_epi32(vTemp, row3);

    row3 = row0;
    row0 = _mm_unpacklo_epi32(row0, row1);
    row3 = _mm_unpackhi_epi32(row3, row1);

    row1 = row0;
    row0 = _mm_unpacklo_epi64(row0, row2);
    row1 = _mm_unpackhi_epi64(row1, row2);

    row2 = row3;
    row2 = _mm_unpacklo_epi64(row2, vTemp);
    row3 = _mm_unpackhi_epi64(row3, vTemp);
}
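
// Usage sketch (illustrative): in-place 4x4 transpose; after the call each
// row register holds what was a column.
//   __m128 r0 = _mm_setr_ps( 0.f,  1.f,  2.f,  3.f);
//   __m128 r1 = _mm_setr_ps( 4.f,  5.f,  6.f,  7.f);
//   __m128 r2 = _mm_setr_ps( 8.f,  9.f, 10.f, 11.f);
//   __m128 r3 = _mm_setr_ps(12.f, 13.f, 14.f, 15.f);
//   vTranspose(r0, r1, r2, r3); // r0 = {0,4,8,12}, r1 = {1,5,9,13}, ...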

#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
{
    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);              //x0z0x1z1 x4z4x5z5
    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);          //x0y0z0w0 x4y4z4w4
    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);          //x1y1z1w1 x5y5z5w5

    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                         //x2z2x3z3 x6z6x7z7
    r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps());            //y2w2y3w3 y6w6y7w7
    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);          //x2y2z2w2 x6y6z6w6
    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);          //x3y3z3w3 x7y7z7w7

    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}

INLINE
void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
{
    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5

    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                 //x2z2x3z3 x6z6x7z7
    r1rx = _simd_unpackhi_ps(vSrc1, vSrc3);                 //y2w2y3w3 y6w6y7w7
    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7

    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}

#if ENABLE_AVX512_SIMD16
INLINE
void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
{
    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking

    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
    simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
    simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a

    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);

    dst[0] = _simd16_unpacklo_ps(rblo, galo);
    dst[1] = _simd16_unpackhi_ps(rblo, galo);
    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
}

#endif
INLINE
void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
{
    simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
    simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
    simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
    simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
    simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
    simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
    simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
    simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
    simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0));
    simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2));
    simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0));
    simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2));
    simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0));
    simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2));
    simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0));
    simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2));
    vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
    vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
    vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
    vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
    vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
    vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
    vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
    vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
}

INLINE
void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
{
    vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3),
        _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
}
#endif
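
// Usage sketch (illustrative): vTranspose8x8 turns eight 8-wide rows into
// eight 8-wide columns, e.g. converting 8 attributes x 8 lanes from SOA to
// AOS entirely in registers:
//   simdscalar rows[8]; // rows[i] = attribute i across lanes 0..7
//   simdscalar cols[8]; // after the call, cols[j] = all attributes of lane j
//   vTranspose8x8(cols, rows[0], rows[1], rows[2], rows[3],
//                 rows[4], rows[5], rows[6], rows[7]);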

//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
//////////////////////////////////////////////////////////////////////////
template<uint32_t bpp>
struct TransposeSingleComponent
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Pass-thru for single component.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH <= KNOB_ARCH_AVX
        __m128i c0c1 = _mm256_castsi256_si128(src);                                          // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                       // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                       // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                         // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                         // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                      // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                      // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#else
        simdscalari dst01 = _simd_shuffle_epi8(src,
            _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _simd_shuffle_epi8(dst23,
            _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _simd_or_si(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
        __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa

        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);

        simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);

        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
    }
#endif
};
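
// Usage sketch (illustrative, assuming the OSALIGNSIMD alignment macro from
// common/os.h and KNOB_SIMD_WIDTH == 8): convert one simd-width of RGBA8
// pixels from SOA to AOS. Buffers must be SIMD-aligned.
//   OSALIGNSIMD(uint8_t) soa[32]; // rrrrrrrr gggggggg bbbbbbbb aaaaaaaa
//   OSALIGNSIMD(uint8_t) aos[32]; // rgba rgba ... (8 pixels)
//   Transpose8_8_8_8::Transpose(soa, aos);
// The remaining TransposeN_N[_N[_N]] structs below follow the same calling
// convention; formats with no vectorized path delete Transpose entirely.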

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);   // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);
        _mm_store_si128((__m128i*)pDst, rg);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg

        simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
        simdscalari cvt1 = _simd_cvtepu8_epi16(src1);

        simdscalari shl1 = _simd_slli_epi32(cvt1, 8);

        simdscalari dst = _simd_or_si(cvt0, shl1);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

        __m128 vDst[8];
        vTranspose4x8(vDst, src0, src1, src2, src3);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);

        simd16scalar dst[4];

        vTranspose4x16(dst, src0, src1, src2, src3);

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

        __m128 vDst[8];
        vTranspose3x8(vDst, src0, src1, src2);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
        simd16scalar src3 = _simd16_setzero_ps();

        simd16scalar dst[4];

        vTranspose4x16(dst, src0, src1, src2, src3);

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        const float* pfSrc = (const float*)pSrc;
        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
        __m128 src_g1 = _mm_load_ps(pfSrc + 12);

        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm_store_ps(pfDst + 0, dst0);
        _mm_store_ps(pfDst + 4, dst1);
        _mm_store_ps(pfDst + 8, dst2);
        _mm_store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));      // rrrrrrrrrrrrrrrr
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); // gggggggggggggggg

        simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
        simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF

        simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
        simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF

        simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
        simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0);  // rgrgrgrgrgrgrgrg
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));     // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa

        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF

        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF

        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rgba0 rgba1 rgba2 rgba3
        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rgba4 rgba5 rgba6 rgba7
        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rgba8 rgba9 rgbaA rgbaB
        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rgbaC rgbaD rgbaE rgbaF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        __m128i src_a = _mm_undefined_si128();

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));     // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        simdscalari src3 = _simd_setzero_si();                                             // aaaaaaaaaaaaaaaa

        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF

        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF

        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rgba0 rgba1 rgba2 rgba3
        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rgba4 rgba5 rgba6 rgba7
        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rgba8 rgba9 rgbaA rgbaB
        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rgbaC rgbaD rgbaE rgbaF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));     // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg

        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF

        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
        simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};

//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose64
//////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

// helper function to unroll loops
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};

template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
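
// Usage sketch (illustrative): a fully unrolled loop over [0, 4). step()
// takes the lambda by lvalue reference, so it must be a named object.
//   int sum = 0;
//   auto body = [&](int i) { sum += i; };
//   UnrollerL<0, 4>::step(body); // sum == 0 + 1 + 2 + 3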

// helper function to unroll loops, with mask to skip specific iterations
template<int Begin, int End, int Step = 1, int Mask = 0x7f>
struct UnrollerLMask {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        if (Mask & (1 << Begin))
        {
            func(Begin);
        }
        // Recurse through UnrollerLMask (not UnrollerL), carrying Mask along
        // so iterations after the first remain subject to the mask.
        UnrollerLMask<Begin + Step, End, Step, Mask>::step(func);
    }
};

template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
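
// Usage sketch (illustrative): bit i of Mask gates iteration i.
//   int sum = 0;
//   auto body = [&](int i) { sum += i; };
//   UnrollerLMask<0, 4, 1, 0x5>::step(body); // runs i = 0 and i = 2 only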

// general CRC compute
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    uint32_t sizeInQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    uint64_t* pDataWords = (uint64_t*)pData;
    for (uint32_t i = 0; i < sizeInQwords; ++i)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    uint32_t sizeInDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    uint32_t* pDataWords = (uint32_t*)pData;
    for (uint32_t i = 0; i < sizeInDwords; ++i)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}
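
// Usage sketch (illustrative): hardware CRC32-C (SSE4.2) over a buffer; the
// seed is the caller's choice and results can be chained across calls.
//   const uint8_t data[] = { 1, 2, 3 };
//   uint32_t crc = ComputeCRC(0, data, sizeof(data));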

//////////////////////////////////////////////////////////////////////////
/// Add byte offset to any-type pointer
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static T* PtrAdd(T* p, intptr_t offset)
{
    intptr_t intp = reinterpret_cast<intptr_t>(p);
    return reinterpret_cast<T*>(intp + offset);
}

//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static bool IsPow2(T value)
{
    return value == (value & (0 - value));
}

//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));
    return value & ~T1(alignment - 1);
}

//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    return AlignDownPow2(value + T1(alignment - 1), alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    return reinterpret_cast<T1*>(
        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
}

//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDown(T1 value, T2 alignment)
{
    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
    return value - T1(value % alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Align down ptr to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignDown(T1* value, T2 alignment)
{
    return (T1*)AlignDown(uintptr_t(value), alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: alignment need not be a power of 2; non-power-of-2 alignments
/// are handled by the AlignDown fallback path
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUp(T1 value, T2 alignment)
{
    return AlignDown(value + T1(alignment - 1), alignment);
}

//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: alignment need not be a power of 2; non-power-of-2 alignments
/// are handled by the AlignDown fallback path
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUp(T1* value, T2 alignment)
{
    return AlignDown(PtrAdd(value, alignment - 1), alignment);
}
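
// Usage sketch (illustrative):
//   AlignDownPow2(13u, 8u) ==  8u
//   AlignUpPow2(13u, 8u)   == 16u
//   AlignUp(13u, 6u)       == 18u  // non-power-of-2 handled via AlignDown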

//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};

public:

    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
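
// Usage sketch (illustrative): with BitsPerElementT = 4 on a 64-bit build,
// 16 elements pack into each size_t word, so arr[19] decodes bits [12..15]
// of m_words[1]:
//   BitsArray<uint32_t, 4, 32> arr; // words are zero-initialized
//   uint32_t e = arr[19];           // reads packed element 19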

// Ranged integer argument for TemplateArgUnroller
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};

// Recursive template used to auto-nest conditionals. Converts dynamic boolean
// and ranged integer function arguments to static template arguments.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val});
        }
        SWR_ASSUME(false); return nullptr;
    }
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        SWR_ASSUME(false); return nullptr;
    }
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
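
// Usage sketch (illustrative, hypothetical names): select a specialization
// of a templated function from a runtime bool. The terminator type exposes
// FuncType and a templated GetFunc:
//   template <bool B> void DoWork() { /* ... */ }
//   struct WorkSelector // hypothetical terminator
//   {
//       typedef void (*FuncType)();
//       template <typename BoolT>
//       static FuncType GetFunc() { return &DoWork<BoolT::value>; }
//   };
//   // Yields &DoWork<true> when cond is true, &DoWork<false> otherwise:
//   void (*pfn)() = TemplateArgUnroller<WorkSelector>::GetFunc(cond);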