swr/rast: Implement JIT shader caching to disk
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include <type_traits>
32 #include <algorithm>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
36 #include "core/api.h"
37
38 #if defined(_WIN64) || defined(__x86_64__)
39 #define _MM_INSERT_EPI64 _mm_insert_epi64
40 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
41 #else
42 INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
43 {
44 OSALIGNLINE(uint32_t) elems[4];
45 _mm_store_si128((__m128i*)elems, a);
46 if (ndx == 0)
47 {
48 uint64_t foo = elems[0];
49 foo |= (uint64_t)elems[1] << 32;
50 return foo;
51 }
52 else
53 {
54 uint64_t foo = elems[2];
55 foo |= (uint64_t)elems[3] << 32;
56 return foo;
57 }
58 }
59
60 INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
61 {
62 OSALIGNLINE(int64_t) elems[2];
63 _mm_store_si128((__m128i*)elems, a);
64 if (ndx == 0)
65 {
66 elems[0] = b;
67 }
68 else
69 {
70 elems[1] = b;
71 }
72 __m128i out;
73 out = _mm_load_si128((const __m128i*)elems);
74 return out;
75 }
76 #endif
77
// Per-lane integer bounding box in SOA form: each SIMD lane holds the
// extents of one primitive's box. Note the member order is y before x.
struct simdBBox
{
    simdscalari ymin;
    simdscalari ymax;
    simdscalari xmin;
    simdscalari xmax;
};
85
#if ENABLE_AVX512_SIMD16
// 16-wide variant of simdBBox for AVX-512 builds; same y-before-x layout.
struct simd16BBox
{
    simd16scalari ymin;
    simd16scalari ymax;
    simd16scalari xmin;
    simd16scalari xmax;
};

#endif
96 INLINE
97 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
98 {
99 __m128i row0i = _mm_castps_si128(row0);
100 __m128i row1i = _mm_castps_si128(row1);
101 __m128i row2i = _mm_castps_si128(row2);
102 __m128i row3i = _mm_castps_si128(row3);
103
104 __m128i vTemp = row2i;
105 row2i = _mm_unpacklo_epi32(row2i, row3i);
106 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
107
108 row3i = row0i;
109 row0i = _mm_unpacklo_epi32(row0i, row1i);
110 row3i = _mm_unpackhi_epi32(row3i, row1i);
111
112 row1i = row0i;
113 row0i = _mm_unpacklo_epi64(row0i, row2i);
114 row1i = _mm_unpackhi_epi64(row1i, row2i);
115
116 row2i = row3i;
117 row2i = _mm_unpacklo_epi64(row2i, vTemp);
118 row3i = _mm_unpackhi_epi64(row3i, vTemp);
119
120 row0 = _mm_castsi128_ps(row0i);
121 row1 = _mm_castsi128_ps(row1i);
122 row2 = _mm_castsi128_ps(row2i);
123 row3 = _mm_castsi128_ps(row3i);
124 }
125
126 INLINE
127 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
128 {
129 __m128i vTemp = row2;
130 row2 = _mm_unpacklo_epi32(row2, row3);
131 vTemp = _mm_unpackhi_epi32(vTemp, row3);
132
133 row3 = row0;
134 row0 = _mm_unpacklo_epi32(row0, row1);
135 row3 = _mm_unpackhi_epi32(row3, row1);
136
137 row1 = row0;
138 row0 = _mm_unpacklo_epi64(row0, row2);
139 row1 = _mm_unpackhi_epi64(row1, row2);
140
141 row2 = row3;
142 row2 = _mm_unpacklo_epi64(row2, vTemp);
143 row3 = _mm_unpackhi_epi64(row3, vTemp);
144 }
145
146 #if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
{
    // SOA->AOS for 3 channels x 8 lanes: produces 8 xmm registers of
    // x,y,z,w per lane, with the w slot filled from a zero vector.
    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps());     //y0w0y1w1 y4w4y5w5
    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5

    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                             //x2z2x3z3 x6z6x7z7
    r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps());                //y2w2y3w3 y6w6y7w7
    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7

    // Lanes 0-3 live in the low 128-bit half of each ymm, lanes 4-7 in the high half.
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
170
INLINE
void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
{
    // SOA->AOS for 4 channels x 8 lanes: produces 8 xmm registers of
    // x,y,z,w per lane. Same interleave pattern as vTranspose3x8 but with a
    // real fourth source instead of zeros.
    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);          //x0z0x1z1 x4z4x5z5
    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);          //y0w0y1w1 y4w4y5w5
    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);      //x0y0z0w0 x4y4z4w4
    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);      //x1y1z1w1 x5y5z5w5

    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                     //x2z2x3z3 x6z6x7z7
    r1rx = _simd_unpackhi_ps(vSrc1, vSrc3);                     //y2w2y3w3 y6w6y7w7
    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);      //x2y2z2w2 x6y6z6w6
    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);      //x3y3z3w3 x7y7z7w7

    // Lanes 0-3 live in the low 128-bit half of each ymm, lanes 4-7 in the high half.
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
194
195 #if ENABLE_AVX512_SIMD16
INLINE
void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
{
    // SOA->AOS for 4 channels x 16 lanes (AVX-512 path). A single cross-lane
    // permute up front puts elements in an order such that plain in-lane
    // unpacks land everything in the right place at the end.
    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking

    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
    simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
    simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a

    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);

    // Final interleave yields four registers of rgba quads, 4 pixels each.
    dst[0] = _simd16_unpacklo_ps(rblo, galo);
    dst[1] = _simd16_unpackhi_ps(rblo, galo);
    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
}
216
217 #endif
INLINE
void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
{
    // Full 8x8 32-bit transpose: the classic AVX three-stage pattern —
    // unpack pairs, shuffle quads, then permute 128-bit halves across rows.
    simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
    simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
    simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
    simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
    simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
    simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
    simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
    simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
    simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    // 0x20 selects (low of a, low of b); 0x31 selects (high of a, high of b).
    vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
    vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
    vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
    vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
    vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
    vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
    vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
    vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
}
246
247 INLINE
248 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
249 {
250 vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3),
251 _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
252 }
253 #endif
254
//////////////////////////////////////////////////////////////////////////
/// TranposeSingleComponent
//////////////////////////////////////////////////////////////////////////
template<uint32_t bpp>
struct TransposeSingleComponent
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Pass-thru for single component.
    ///        With one component, SOA and AOS layouts are identical, so the
    ///        "transpose" is a straight copy of bpp bits per lane.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide (AVX-512) variant: same pass-thru copy, twice the lanes.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
    }
#endif
};
277
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    ///        Input is 4 runs of 8 bytes (r..., g..., b..., a...); output is
    ///        8 interleaved rgba pixels.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH <= KNOB_ARCH_AVX
        // AVX1 has no 256-bit integer byte shuffle, so interleave the
        // channels with 128-bit unpacks instead.
        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#else
        // AVX2+: byte-shuffle each 128-bit lane into its final slots (0x80
        // lanes zero the byte), swap halves, shuffle again, and OR together.
        simdscalari dst01 = _simd_shuffle_epi8(src,
            _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _simd_shuffle_epi8(dst23,
            _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _simd_or_si(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide variant: widen each 8-bit channel to 32 bits, shift each
    // channel into its byte position, and OR the four channels together.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
        __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa

        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);

        simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);

        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
    }
#endif
};
340
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    ///        Deleted: this format has no transpose implementation, so any
    ///        attempted use is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
356
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    ///        Input is 8 r bytes followed by 8 g bytes; output is 8
    ///        interleaved rg pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        // Both channels fit in the low 128-bit half of the ymm load.
        __m128i rg = _mm256_castsi256_si128(src);    // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);      // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);               // rgrgrgrgrgrgrgrg
        _mm_store_si128((__m128i*)pDst, rg);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide variant: widen each 8-bit channel to 16 bits, shift g into the
    // high byte, and OR the channels together.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg

        simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
        simdscalari cvt1 = _simd_cvtepu8_epi16(src1);

        simdscalari shl1 = _simd_slli_epi32(cvt1, 8);

        simdscalari dst = _simd_or_si(cvt0, shl1);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
    }
#endif
};
397
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    ///        Input is 4 channel vectors of 8 floats each; output is 8
    ///        xyzw quads, one __m128 per pixel.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

        // vTranspose4x8 does the interleave; one 4-float store per pixel.
        __m128 vDst[8];
        vTranspose4x8(vDst, src0, src1, src2, src3);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst+4, vDst[1]);
        _mm_store_ps((float*)pDst+8, vDst[2]);
        _mm_store_ps((float*)pDst+12, vDst[3]);
        _mm_store_ps((float*)pDst+16, vDst[4]);
        _mm_store_ps((float*)pDst+20, vDst[5]);
        _mm_store_ps((float*)pDst+24, vDst[6]);
        _mm_store_ps((float*)pDst+28, vDst[7]);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide variant: 4 channel vectors of 16 floats -> 16 xyzw quads.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);

        simd16scalar dst[4];

        vTranspose4x16(dst, src0, src1, src2, src3);

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
    }
#endif
};
449
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    ///        Three channels in; the output w slot is filled with zeros by
    ///        the 3x8 transpose helper.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

        // Note: output stride is still 4 floats per pixel (xyz + zeroed w).
        __m128 vDst[8];
        vTranspose3x8(vDst, src0, src1, src2);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide variant: reuse the 4-channel transpose with a zero vector
    // standing in for the missing fourth channel.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
        simd16scalar src3 = _simd16_setzero_ps();

        simd16scalar dst[4];

        vTranspose4x16(dst, src0, src1, src2, src3);

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
    }
#endif
};
500
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
    ///        Two channels of 8 floats interleaved into 8 rg pairs using
    ///        plain SSE unpacks.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        const float* pfSrc = (const float*)pSrc;
        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
        __m128 src_g1 = _mm_load_ps(pfSrc + 12);

        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm_store_ps(pfDst + 0, dst0);
        _mm_store_ps(pfDst + 4, dst1);
        _mm_store_ps(pfDst + 8, dst2);
        _mm_store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide variant: unpack interleaves within 128-bit lanes, then two
    // rounds of 128-bit permutes restore sequential pixel order.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));      // rrrrrrrrrrrrrrrr
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); // gggggggggggggggg

        simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);            // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
        simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);            // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF

        simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
        simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF

        simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
        simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0);    // rgrgrgrgrgrgrgrg
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);   // rgrgrgrgrgrgrgrg
    }
#endif
};
554
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    ///        Four channels of 8 16-bit values; each ymm load holds two
    ///        channels (one per 128-bit half).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // Interleave 16-bit pairs, then 32-bit rg/ba pairs into rgba quads.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide variant: in-lane unpacks produce interleaved halves, then
    // 128-bit permutes restore sequential pixel order.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));     // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa

        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF

        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF

        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rgba0 rgba1 rgba2 rgba3
        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rgba4 rgba5 rgba6 rgba7
        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rgba8 rgba9 rgbaA rgbaB
        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rgbaC rgbaD rgbaE rgbaF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba
    }
#endif
};
624
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    ///        Like 16_16_16_16 but the alpha source is undefined; the
    ///        output still has 4 channels of stride, with garbage in the
    ///        alpha slots. NOTE(review): presumably callers ignore the
    ///        fourth channel for this format — verify at call sites.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        __m128i src_a = _mm_undefined_si128(); // alpha lane contents are unspecified

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide variant: identical interleave to Transpose16_16_16_16 but with
    // a zero vector standing in for the missing alpha channel.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));     // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg
        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        simdscalari src3 = _simd_setzero_si();                                             // zeros for the absent alpha

        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF

        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF

        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rgba0 rgba1 rgba2 rgba3
        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rgba4 rgba5 rgba6 rgba7
        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rgba8 rgba9 rgbaA rgbaB
        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rgbaC rgbaD rgbaE rgbaF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba
    }
#endif
};
693
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    ///        One ymm load holds both channels (r in the low 128-bit half,
    ///        g in the high half); a 16-bit unpack interleaves them.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    // 16-wide variant: in-lane unpacks then a 128-bit permute to restore
    // sequential pixel order.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));     // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg

        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF

        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
        simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
    }
#endif
};
741
742 //////////////////////////////////////////////////////////////////////////
743 /// Transpose24_8
744 //////////////////////////////////////////////////////////////////////////
745 struct Transpose24_8
746 {
747 //////////////////////////////////////////////////////////////////////////
748 /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
749 /// @param pSrc - source data in SOA form
750 /// @param pDst - output data in AOS form
751 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
752 #if ENABLE_AVX512_SIMD16
753
754 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
755 #endif
756 };
757
758 //////////////////////////////////////////////////////////////////////////
759 /// Transpose32_8_24
760 //////////////////////////////////////////////////////////////////////////
761 struct Transpose32_8_24
762 {
763 //////////////////////////////////////////////////////////////////////////
764 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
765 /// @param pSrc - source data in SOA form
766 /// @param pDst - output data in AOS form
767 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
768 #if ENABLE_AVX512_SIMD16
769
770 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
771 #endif
772 };
773
774 //////////////////////////////////////////////////////////////////////////
775 /// Transpose4_4_4_4
776 //////////////////////////////////////////////////////////////////////////
777 struct Transpose4_4_4_4
778 {
779 //////////////////////////////////////////////////////////////////////////
780 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
781 /// @param pSrc - source data in SOA form
782 /// @param pDst - output data in AOS form
783 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
784 #if ENABLE_AVX512_SIMD16
785
786 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
787 #endif
788 };
789
790 //////////////////////////////////////////////////////////////////////////
791 /// Transpose5_6_5
792 //////////////////////////////////////////////////////////////////////////
793 struct Transpose5_6_5
794 {
795 //////////////////////////////////////////////////////////////////////////
796 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
797 /// @param pSrc - source data in SOA form
798 /// @param pDst - output data in AOS form
799 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
800 #if ENABLE_AVX512_SIMD16
801
802 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
803 #endif
804 };
805
806 //////////////////////////////////////////////////////////////////////////
807 /// Transpose9_9_9_5
808 //////////////////////////////////////////////////////////////////////////
809 struct Transpose9_9_9_5
810 {
811 //////////////////////////////////////////////////////////////////////////
812 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
813 /// @param pSrc - source data in SOA form
814 /// @param pDst - output data in AOS form
815 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
816 #if ENABLE_AVX512_SIMD16
817
818 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
819 #endif
820 };
821
822 //////////////////////////////////////////////////////////////////////////
823 /// Transpose5_5_5_1
824 //////////////////////////////////////////////////////////////////////////
825 struct Transpose5_5_5_1
826 {
827 //////////////////////////////////////////////////////////////////////////
828 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
829 /// @param pSrc - source data in SOA form
830 /// @param pDst - output data in AOS form
831 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
832 #if ENABLE_AVX512_SIMD16
833
834 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
835 #endif
836 };
837
838 //////////////////////////////////////////////////////////////////////////
839 /// Transpose1_5_5_5
840 //////////////////////////////////////////////////////////////////////////
841 struct Transpose1_5_5_5
842 {
843 //////////////////////////////////////////////////////////////////////////
844 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
845 /// @param pSrc - source data in SOA form
846 /// @param pDst - output data in AOS form
847 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
848 };
849
850 //////////////////////////////////////////////////////////////////////////
851 /// Transpose10_10_10_2
852 //////////////////////////////////////////////////////////////////////////
853 struct Transpose10_10_10_2
854 {
855 //////////////////////////////////////////////////////////////////////////
856 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
857 /// @param pSrc - source data in SOA form
858 /// @param pDst - output data in AOS form
859 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
860 #if ENABLE_AVX512_SIMD16
861
862 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
863 #endif
864 };
865
866 //////////////////////////////////////////////////////////////////////////
867 /// Transpose11_11_10
868 //////////////////////////////////////////////////////////////////////////
869 struct Transpose11_11_10
870 {
871 //////////////////////////////////////////////////////////////////////////
872 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
873 /// @param pSrc - source data in SOA form
874 /// @param pDst - output data in AOS form
875 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
876 #if ENABLE_AVX512_SIMD16
877
878 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
879 #endif
880 };
881
882 //////////////////////////////////////////////////////////////////////////
883 /// Transpose64
884 //////////////////////////////////////////////////////////////////////////
885 struct Transpose64
886 {
887 //////////////////////////////////////////////////////////////////////////
888 /// @brief Performs an SOA to AOS conversion
889 /// @param pSrc - source data in SOA form
890 /// @param pDst - output data in AOS form
891 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
892 #if ENABLE_AVX512_SIMD16
893
894 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
895 #endif
896 };
897
898 //////////////////////////////////////////////////////////////////////////
899 /// Transpose64_64
900 //////////////////////////////////////////////////////////////////////////
901 struct Transpose64_64
902 {
903 //////////////////////////////////////////////////////////////////////////
904 /// @brief Performs an SOA to AOS conversion
905 /// @param pSrc - source data in SOA form
906 /// @param pDst - output data in AOS form
907 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
908 #if ENABLE_AVX512_SIMD16
909
910 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
911 #endif
912 };
913
914 //////////////////////////////////////////////////////////////////////////
915 /// Transpose64_64_64
916 //////////////////////////////////////////////////////////////////////////
917 struct Transpose64_64_64
918 {
919 //////////////////////////////////////////////////////////////////////////
920 /// @brief Performs an SOA to AOS conversion
921 /// @param pSrc - source data in SOA form
922 /// @param pDst - output data in AOS form
923 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
924 #if ENABLE_AVX512_SIMD16
925
926 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
927 #endif
928 };
929
930 //////////////////////////////////////////////////////////////////////////
931 /// Transpose64_64_64_64
932 //////////////////////////////////////////////////////////////////////////
933 struct Transpose64_64_64_64
934 {
935 //////////////////////////////////////////////////////////////////////////
936 /// @brief Performs an SOA to AOS conversion
937 /// @param pSrc - source data in SOA form
938 /// @param pDst - output data in AOS form
939 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
940 #if ENABLE_AVX512_SIMD16
941
942 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
943 #endif
944 };
945
// helper function to unroll loops
// Invokes func(Begin), func(Begin + Step), ... via recursive template
// instantiation; the UnrollerL<End, End, Step> specialization below
// terminates the recursion.
// NOTE: Step must land exactly on End (i.e. (End - Begin) % Step == 0),
// otherwise the terminator is never matched and instantiation recurses
// until the compiler's depth limit.
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        func(Begin);
        UnrollerL<Begin + Step, End, Step>::step(func);
    }
};
955
// Recursion terminator: once Begin reaches End there is nothing left to do.
template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
962
963 // helper function to unroll loops, with mask to skip specific iterations
964 template<int Begin, int End, int Step = 1, int Mask = 0x7f>
965 struct UnrollerLMask {
966 template<typename Lambda>
967 INLINE static void step(Lambda& func) {
968 if(Mask & (1 << Begin))
969 {
970 func(Begin);
971 }
972 UnrollerL<Begin + Step, End, Step>::step(func);
973 }
974 };
975
// Recursion terminator for the masked unroller.
template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
982
983 // general CRC compute
984 INLINE
985 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
986 {
987 #if defined(_WIN64) || defined(__x86_64__)
988 uint32_t sizeInQwords = size / sizeof(uint64_t);
989 uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
990 uint64_t* pDataWords = (uint64_t*)pData;
991 for (uint32_t i = 0; i < sizeInQwords; ++i)
992 {
993 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
994 }
995 #else
996 uint32_t sizeInDwords = size / sizeof(uint32_t);
997 uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
998 uint32_t* pDataWords = (uint32_t*)pData;
999 for (uint32_t i = 0; i < sizeInDwords; ++i)
1000 {
1001 crc = _mm_crc32_u32(crc, *pDataWords++);
1002 }
1003 #endif
1004
1005 uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
1006 for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
1007 {
1008 crc = _mm_crc32_u8(crc, *pRemainderBytes++);
1009 }
1010
1011 return crc;
1012 }
1013
1014 //////////////////////////////////////////////////////////////////////////
1015 /// Add byte offset to any-type pointer
1016 //////////////////////////////////////////////////////////////////////////
1017 template <typename T>
1018 INLINE
1019 static T* PtrAdd(T* p, intptr_t offset)
1020 {
1021 intptr_t intp = reinterpret_cast<intptr_t>(p);
1022 return reinterpret_cast<T*>(intp + offset);
1023 }
1024
1025 //////////////////////////////////////////////////////////////////////////
1026 /// Is a power-of-2?
1027 //////////////////////////////////////////////////////////////////////////
1028 template <typename T>
1029 INLINE
1030 static bool IsPow2(T value)
1031 {
1032 return value == (value & (0 - value));
1033 }
1034
1035 //////////////////////////////////////////////////////////////////////////
1036 /// Align down to specified alignment
1037 /// Note: IsPow2(alignment) MUST be true
1038 //////////////////////////////////////////////////////////////////////////
1039 template <typename T1, typename T2>
1040 INLINE
1041 static T1 AlignDownPow2(T1 value, T2 alignment)
1042 {
1043 SWR_ASSERT(IsPow2(alignment));
1044 return value & ~T1(alignment - 1);
1045 }
1046
1047 //////////////////////////////////////////////////////////////////////////
1048 /// Align up to specified alignment
1049 /// Note: IsPow2(alignment) MUST be true
1050 //////////////////////////////////////////////////////////////////////////
1051 template <typename T1, typename T2>
1052 INLINE
1053 static T1 AlignUpPow2(T1 value, T2 alignment)
1054 {
1055 return AlignDownPow2(value + T1(alignment - 1), alignment);
1056 }
1057
1058 //////////////////////////////////////////////////////////////////////////
1059 /// Align up ptr to specified alignment
1060 /// Note: IsPow2(alignment) MUST be true
1061 //////////////////////////////////////////////////////////////////////////
1062 template <typename T1, typename T2>
1063 INLINE
1064 static T1* AlignUpPow2(T1* value, T2 alignment)
1065 {
1066 return reinterpret_cast<T1*>(
1067 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
1068 }
1069
1070 //////////////////////////////////////////////////////////////////////////
1071 /// Align down to specified alignment
1072 //////////////////////////////////////////////////////////////////////////
1073 template <typename T1, typename T2>
1074 INLINE
1075 static T1 AlignDown(T1 value, T2 alignment)
1076 {
1077 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
1078 return value - T1(value % alignment);
1079 }
1080
1081 //////////////////////////////////////////////////////////////////////////
1082 /// Align down to specified alignment
1083 //////////////////////////////////////////////////////////////////////////
1084 template <typename T1, typename T2>
1085 INLINE
1086 static T1* AlignDown(T1* value, T2 alignment)
1087 {
1088 return (T1*)AlignDown(uintptr_t(value), alignment);
1089 }
1090
1091 //////////////////////////////////////////////////////////////////////////
1092 /// Align up to specified alignment
1093 /// Note: IsPow2(alignment) MUST be true
1094 //////////////////////////////////////////////////////////////////////////
1095 template <typename T1, typename T2>
1096 INLINE
1097 static T1 AlignUp(T1 value, T2 alignment)
1098 {
1099 return AlignDown(value + T1(alignment - 1), alignment);
1100 }
1101
1102 //////////////////////////////////////////////////////////////////////////
1103 /// Align up to specified alignment
1104 /// Note: IsPow2(alignment) MUST be true
1105 //////////////////////////////////////////////////////////////////////////
1106 template <typename T1, typename T2>
1107 INLINE
1108 static T1* AlignUp(T1* value, T2 alignment)
1109 {
1110 return AlignDown(PtrAdd(value, alignment - 1), alignment);
1111 }
1112
1113 //////////////////////////////////////////////////////////////////////////
1114 /// Helper structure used to access an array of elements that don't
1115 /// correspond to a typical word size.
1116 //////////////////////////////////////////////////////////////////////////
1117 template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
1118 class BitsArray
1119 {
1120 private:
1121 static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
1122 static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
1123 static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
1124 static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;
1125
1126 static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
1127 "Element size must an integral fraction of pointer size");
1128
1129 size_t m_words[NUM_WORDS] = {};
1130
1131 public:
1132
1133 T operator[] (size_t elementIndex) const
1134 {
1135 size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
1136 word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
1137 return T(word & ELEMENT_MASK);
1138 }
1139 };
1140
// Ranged integer argument for TemplateArgUnroller
// Wraps a runtime uint32_t expected to lie in [TMin, TMax] so
// TemplateArgUnroller can convert it into a std::integral_constant
// template argument.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;  // runtime value; expected: TMin <= val <= TMax
};
1147
// Recursive template used to auto-nest conditionals. Converts dynamic boolean function
// arguments to static template arguments.
//
// Each GetFunc overload consumes one runtime argument (a bool, or an
// IntArg<TMin, TMax> range), appends the corresponding compile-time type
// (std::true_type/std::false_type or std::integral_constant<uint32_t, N>)
// to the ArgsB... pack, and recurses on the remaining arguments. When the
// last argument is consumed, TermT::GetFunc<ArgsB...>() is invoked to
// fetch the fully-specialized function pointer.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator: one bool left; pick true_type or false_type and
    // hand the completed pack to the terminator class.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args: consume one bool, recurse on the rest.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator: linear search down from TMax until the runtime
    // value matches, then emit the matching integral_constant.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        SWR_ASSUME(false); return nullptr;
    }
    // Degenerate single-value range: no search needed.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args: same linear search, then recurse on the rest.
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        SWR_ASSUME(false); return nullptr;
    }
    // Degenerate single-value range with more args to process.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1226
1227 //////////////////////////////////////////////////////////////////////////
1228 /// Helpers used to get / set environment variable
1229 //////////////////////////////////////////////////////////////////////////
1230 static INLINE std::string GetEnv(const std::string& variableName)
1231 {
1232 std::string output;
1233 #if defined(_WIN32)
1234 DWORD valueSize = GetEnvironmentVariableA(variableName.c_str(), nullptr, 0);
1235 if (!valueSize) return output;
1236 output.resize(valueSize - 1); // valueSize includes null, output.resize() does not
1237 GetEnvironmentVariableA(variableName.c_str(), &output[0], valueSize);
1238 #else
1239 output = getenv(variableName.c_str());
1240 #endif
1241
1242 return output;
1243 }
1244
/// @brief Sets (or overwrites) an environment variable for this process.
/// @param variableName - name of the variable to set
/// @param value - value to assign
static INLINE void SetEnv(const std::string& variableName, const std::string& value)
{
#if defined(_WIN32)
    SetEnvironmentVariableA(variableName.c_str(), value.c_str());
#else
    setenv(variableName.c_str(), value.c_str(), true); // third arg: overwrite existing value
#endif
}
1253