swr: [rasterizer core] buckets fixes
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include <type_traits>
32 #include "common/os.h"
33 #include "common/simdintrin.h"
34 #include "common/swr_assert.h"
35
36 #if defined(_WIN32)
// Image file helpers. Only declared here; their definitions are not part of
// this header.
// NOTE(review): the behavior described below is inferred from the names and
// signatures only - confirm against the implementation.

// Presumably writes the width x height image held in pBuffer to pFilename as PNG.
void SaveImageToPNGFile(
    const WCHAR *pFilename,
    void *pBuffer,
    uint32_t width,
    uint32_t height,
    bool broadcastRed);

// Presumably reads a bitmap from pFilename and hands back its pixel buffer and
// dimensions through the out-parameters.
void OpenBitmapFromFile(
    const WCHAR *pFilename,
    void **pBuffer,
    uint32_t *width,
    uint32_t *height);
49 #endif
50
51 #if defined(_WIN64) || defined(__x86_64__)
52 #define _MM_INSERT_EPI64 _mm_insert_epi64
53 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
54 #else
55 INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
56 {
57 OSALIGNLINE(uint32_t) elems[4];
58 _mm_store_si128((__m128i*)elems, a);
59 if (ndx == 0)
60 {
61 uint64_t foo = elems[0];
62 foo |= (uint64_t)elems[1] << 32;
63 return foo;
64 }
65 else
66 {
67 uint64_t foo = elems[2];
68 foo |= (uint64_t)elems[3] << 32;
69 return foo;
70 }
71 }
72
73 INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
74 {
75 OSALIGNLINE(int64_t) elems[2];
76 _mm_store_si128((__m128i*)elems, a);
77 if (ndx == 0)
78 {
79 elems[0] = b;
80 }
81 else
82 {
83 elems[1] = b;
84 }
85 __m128i out;
86 out = _mm_load_si128((const __m128i*)elems);
87 return out;
88 }
89 #endif
90
91 OSALIGNLINE(struct) BBOX
92 {
93 int top{ 0 };
94 int bottom{ 0 };
95 int left{ 0 };
96 int right{ 0 };
97
98 BBOX() {}
99 BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
100
101 bool operator==(const BBOX& rhs)
102 {
103 return (this->top == rhs.top &&
104 this->bottom == rhs.bottom &&
105 this->left == rhs.left &&
106 this->right == rhs.right);
107 }
108
109 bool operator!=(const BBOX& rhs)
110 {
111 return !(*this == rhs);
112 }
113 };
114
// Bounding-box edges stored as SIMD integer vectors (simdscalari), one member
// per edge, using the same edge names as BBOX.
// NOTE(review): presumably each SIMD lane holds the edge of a separate box -
// confirm against the code that fills this in.
struct simdBBox
{
    simdscalari top;
    simdscalari bottom;
    simdscalari left;
    simdscalari right;
};
122
123 INLINE
124 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
125 {
126 __m128i row0i = _mm_castps_si128(row0);
127 __m128i row1i = _mm_castps_si128(row1);
128 __m128i row2i = _mm_castps_si128(row2);
129 __m128i row3i = _mm_castps_si128(row3);
130
131 __m128i vTemp = row2i;
132 row2i = _mm_unpacklo_epi32(row2i, row3i);
133 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
134
135 row3i = row0i;
136 row0i = _mm_unpacklo_epi32(row0i, row1i);
137 row3i = _mm_unpackhi_epi32(row3i, row1i);
138
139 row1i = row0i;
140 row0i = _mm_unpacklo_epi64(row0i, row2i);
141 row1i = _mm_unpackhi_epi64(row1i, row2i);
142
143 row2i = row3i;
144 row2i = _mm_unpacklo_epi64(row2i, vTemp);
145 row3i = _mm_unpackhi_epi64(row3i, vTemp);
146
147 row0 = _mm_castsi128_ps(row0i);
148 row1 = _mm_castsi128_ps(row1i);
149 row2 = _mm_castsi128_ps(row2i);
150 row3 = _mm_castsi128_ps(row3i);
151 }
152
153 INLINE
154 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
155 {
156 __m128i vTemp = row2;
157 row2 = _mm_unpacklo_epi32(row2, row3);
158 vTemp = _mm_unpackhi_epi32(vTemp, row3);
159
160 row3 = row0;
161 row0 = _mm_unpacklo_epi32(row0, row1);
162 row3 = _mm_unpackhi_epi32(row3, row1);
163
164 row1 = row0;
165 row0 = _mm_unpacklo_epi64(row0, row2);
166 row1 = _mm_unpackhi_epi64(row1, row2);
167
168 row2 = row3;
169 row2 = _mm_unpacklo_epi64(row2, vTemp);
170 row3 = _mm_unpackhi_epi64(row3, vTemp);
171 }
172
// GCC version folded into a single comparable integer:
// major * 10000 + minor * 100 + patchlevel (so 4.9.0 becomes 40900).
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// For clang, and for GCC older than 4.9.0, map the "undefined" vector
// intrinsics onto their zero-initializing counterparts.
// NOTE(review): presumably those compilers do not provide the
// _mm*_undefined_* intrinsics - confirm.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
184
185 #if KNOB_SIMD_WIDTH == 8
186 INLINE
187 void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
188 {
189 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
190 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
191 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
192 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
193
194 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
195 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
196 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
197 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
198
199 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
200 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
201 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
202 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
203
204 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
205 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
206 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
207 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
208 }
209
210 INLINE
211 void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
212 {
213 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
214 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
215 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
216 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
217
218 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
219 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
220 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
221 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
222
223 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
224 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
225 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
226 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
227
228 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
229 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
230 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
231 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
232 }
233
234 INLINE
235 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
236 {
237 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
238 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
239 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
240 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
241 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
242 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
243 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
244 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
245 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
246 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
247 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
248 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
249 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
250 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
251 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
252 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
253 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
254 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
255 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
256 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
257 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
258 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
259 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
260 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
261 }
262
263 INLINE
264 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
265 {
266 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
267 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
268 }
269 #endif
270
271 //////////////////////////////////////////////////////////////////////////
272 /// TranposeSingleComponent
273 //////////////////////////////////////////////////////////////////////////
274 template<uint32_t bpp>
275 struct TransposeSingleComponent
276 {
277 //////////////////////////////////////////////////////////////////////////
278 /// @brief Pass-thru for single component.
279 /// @param pSrc - source data in SOA form
280 /// @param pDst - output data in AOS form
281 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
282 {
283 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
284 }
285 };
286
287 //////////////////////////////////////////////////////////////////////////
288 /// Transpose8_8_8_8
289 //////////////////////////////////////////////////////////////////////////
290 struct Transpose8_8_8_8
291 {
292 //////////////////////////////////////////////////////////////////////////
293 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
294 /// @param pSrc - source data in SOA form
295 /// @param pDst - output data in AOS form
296 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
297 {
298 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
299 #if KNOB_SIMD_WIDTH == 8
300 #if KNOB_ARCH == KNOB_ARCH_AVX
301 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
302 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
303 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
304 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
305 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
306 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
307 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
308 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
309 _mm_store_si128((__m128i*)pDst, c0123lo);
310 _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
311 #elif KNOB_ARCH == KNOB_ARCH_AVX2
312 simdscalari dst01 = _mm256_shuffle_epi8(src,
313 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
314 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
315 dst23 = _mm256_shuffle_epi8(dst23,
316 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
317 simdscalari dst = _mm256_or_si256(dst01, dst23);
318 _simd_store_si((simdscalari*)pDst, dst);
319 #endif
320 #else
321 #error Unsupported vector width
322 #endif
323 }
324 };
325
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 8_8_8 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
337
338 //////////////////////////////////////////////////////////////////////////
339 /// Transpose8_8
340 //////////////////////////////////////////////////////////////////////////
341 struct Transpose8_8
342 {
343 //////////////////////////////////////////////////////////////////////////
344 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
345 /// @param pSrc - source data in SOA form
346 /// @param pDst - output data in AOS form
347 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
348 {
349 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
350
351 #if KNOB_SIMD_WIDTH == 8
352 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
353 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
354 rg = _mm_unpacklo_epi8(rg, g);
355 _mm_store_si128((__m128i*)pDst, rg);
356 #else
357 #error Unsupported vector width
358 #endif
359 }
360 };
361
362 //////////////////////////////////////////////////////////////////////////
363 /// Transpose32_32_32_32
364 //////////////////////////////////////////////////////////////////////////
365 struct Transpose32_32_32_32
366 {
367 //////////////////////////////////////////////////////////////////////////
368 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
369 /// @param pSrc - source data in SOA form
370 /// @param pDst - output data in AOS form
371 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
372 {
373 #if KNOB_SIMD_WIDTH == 8
374 simdscalar src0 = _simd_load_ps((const float*)pSrc);
375 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
376 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
377 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
378
379 __m128 vDst[8];
380 vTranspose4x8(vDst, src0, src1, src2, src3);
381 _mm_store_ps((float*)pDst, vDst[0]);
382 _mm_store_ps((float*)pDst+4, vDst[1]);
383 _mm_store_ps((float*)pDst+8, vDst[2]);
384 _mm_store_ps((float*)pDst+12, vDst[3]);
385 _mm_store_ps((float*)pDst+16, vDst[4]);
386 _mm_store_ps((float*)pDst+20, vDst[5]);
387 _mm_store_ps((float*)pDst+24, vDst[6]);
388 _mm_store_ps((float*)pDst+28, vDst[7]);
389 #else
390 #error Unsupported vector width
391 #endif
392 }
393 };
394
395 //////////////////////////////////////////////////////////////////////////
396 /// Transpose32_32_32
397 //////////////////////////////////////////////////////////////////////////
398 struct Transpose32_32_32
399 {
400 //////////////////////////////////////////////////////////////////////////
401 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
402 /// @param pSrc - source data in SOA form
403 /// @param pDst - output data in AOS form
404 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
405 {
406 #if KNOB_SIMD_WIDTH == 8
407 simdscalar src0 = _simd_load_ps((const float*)pSrc);
408 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
409 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
410
411 __m128 vDst[8];
412 vTranspose3x8(vDst, src0, src1, src2);
413 _mm_store_ps((float*)pDst, vDst[0]);
414 _mm_store_ps((float*)pDst + 4, vDst[1]);
415 _mm_store_ps((float*)pDst + 8, vDst[2]);
416 _mm_store_ps((float*)pDst + 12, vDst[3]);
417 _mm_store_ps((float*)pDst + 16, vDst[4]);
418 _mm_store_ps((float*)pDst + 20, vDst[5]);
419 _mm_store_ps((float*)pDst + 24, vDst[6]);
420 _mm_store_ps((float*)pDst + 28, vDst[7]);
421 #else
422 #error Unsupported vector width
423 #endif
424 }
425 };
426
427 //////////////////////////////////////////////////////////////////////////
428 /// Transpose32_32
429 //////////////////////////////////////////////////////////////////////////
430 struct Transpose32_32
431 {
432 //////////////////////////////////////////////////////////////////////////
433 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
434 /// @param pSrc - source data in SOA form
435 /// @param pDst - output data in AOS form
436 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
437 {
438 const float* pfSrc = (const float*)pSrc;
439 __m128 src_r0 = _mm_load_ps(pfSrc + 0);
440 __m128 src_r1 = _mm_load_ps(pfSrc + 4);
441 __m128 src_g0 = _mm_load_ps(pfSrc + 8);
442 __m128 src_g1 = _mm_load_ps(pfSrc + 12);
443
444 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
445 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
446 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
447 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
448
449 float* pfDst = (float*)pDst;
450 _mm_store_ps(pfDst + 0, dst0);
451 _mm_store_ps(pfDst + 4, dst1);
452 _mm_store_ps(pfDst + 8, dst2);
453 _mm_store_ps(pfDst + 12, dst3);
454 }
455 };
456
457 //////////////////////////////////////////////////////////////////////////
458 /// Transpose16_16_16_16
459 //////////////////////////////////////////////////////////////////////////
460 struct Transpose16_16_16_16
461 {
462 //////////////////////////////////////////////////////////////////////////
463 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
464 /// @param pSrc - source data in SOA form
465 /// @param pDst - output data in AOS form
466 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
467 {
468 #if KNOB_SIMD_WIDTH == 8
469 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
470 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
471
472 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
473 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
474 __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
475 __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
476
477 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
478 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
479 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
480 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
481
482 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
483 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
484 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
485 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
486
487 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
488 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
489 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
490 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
491 #else
492 #error Unsupported vector width
493 #endif
494 }
495 };
496
497 //////////////////////////////////////////////////////////////////////////
498 /// Transpose16_16_16
499 //////////////////////////////////////////////////////////////////////////
500 struct Transpose16_16_16
501 {
502 //////////////////////////////////////////////////////////////////////////
503 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
504 /// @param pSrc - source data in SOA form
505 /// @param pDst - output data in AOS form
506 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
507 {
508 #if KNOB_SIMD_WIDTH == 8
509 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
510
511 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
512 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
513 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
514 __m128i src_a = _mm_undefined_si128();
515
516 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
517 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
518 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
519 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
520
521 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
522 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
523 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
524 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
525
526 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
527 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
528 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
529 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
530 #else
531 #error Unsupported vector width
532 #endif
533 }
534 };
535
536 //////////////////////////////////////////////////////////////////////////
537 /// Transpose16_16
538 //////////////////////////////////////////////////////////////////////////
539 struct Transpose16_16
540 {
541 //////////////////////////////////////////////////////////////////////////
542 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
543 /// @param pSrc - source data in SOA form
544 /// @param pDst - output data in AOS form
545 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
546 {
547 simdscalar src = _simd_load_ps((const float*)pSrc);
548
549 #if KNOB_SIMD_WIDTH == 8
550 __m128 comp0 = _mm256_castps256_ps128(src);
551 __m128 comp1 = _mm256_extractf128_ps(src, 1);
552
553 __m128i comp0i = _mm_castps_si128(comp0);
554 __m128i comp1i = _mm_castps_si128(comp1);
555
556 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
557 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
558
559 _mm_store_si128((__m128i*)pDst, resLo);
560 _mm_store_si128((__m128i*)pDst + 1, resHi);
561 #else
562 #error Unsupported vector width
563 #endif
564 }
565 };
566
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
578
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
590
591
592
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
604
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
616
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
628
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
640
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
652
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data. Not implemented:
    ///        the function is declared deleted, so any call fails to compile.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
664
665 // helper function to unroll loops
666 template<int Begin, int End, int Step = 1>
667 struct UnrollerL {
668 template<typename Lambda>
669 INLINE static void step(Lambda& func) {
670 func(Begin);
671 UnrollerL<Begin + Step, End, Step>::step(func);
672 }
673 };
674
675 template<int End, int Step>
676 struct UnrollerL<End, End, Step> {
677 template<typename Lambda>
678 static void step(Lambda& func) {
679 }
680 };
681
682 // general CRC compute
683 INLINE
684 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
685 {
686 #if defined(_WIN64) || defined(__x86_64__)
687 uint32_t sizeInQwords = size / sizeof(uint64_t);
688 uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
689 uint64_t* pDataWords = (uint64_t*)pData;
690 for (uint32_t i = 0; i < sizeInQwords; ++i)
691 {
692 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
693 }
694 #else
695 uint32_t sizeInDwords = size / sizeof(uint32_t);
696 uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
697 uint32_t* pDataWords = (uint32_t*)pData;
698 for (uint32_t i = 0; i < sizeInDwords; ++i)
699 {
700 crc = _mm_crc32_u32(crc, *pDataWords++);
701 }
702 #endif
703
704 uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
705 for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
706 {
707 crc = _mm_crc32_u8(crc, *pRemainderBytes++);
708 }
709
710 return crc;
711 }
712
713 //////////////////////////////////////////////////////////////////////////
714 /// Add byte offset to any-type pointer
715 //////////////////////////////////////////////////////////////////////////
716 template <typename T>
717 INLINE
718 static T* PtrAdd(T* p, intptr_t offset)
719 {
720 intptr_t intp = reinterpret_cast<intptr_t>(p);
721 return reinterpret_cast<T*>(intp + offset);
722 }
723
724 //////////////////////////////////////////////////////////////////////////
725 /// Is a power-of-2?
726 //////////////////////////////////////////////////////////////////////////
727 template <typename T>
728 INLINE
729 static bool IsPow2(T value)
730 {
731 return value == (value & (0 - value));
732 }
733
734 //////////////////////////////////////////////////////////////////////////
735 /// Align down to specified alignment
736 /// Note: IsPow2(alignment) MUST be true
737 //////////////////////////////////////////////////////////////////////////
738 template <typename T1, typename T2>
739 INLINE
740 static T1 AlignDownPow2(T1 value, T2 alignment)
741 {
742 SWR_ASSERT(IsPow2(alignment));
743 return value & ~T1(alignment - 1);
744 }
745
746 //////////////////////////////////////////////////////////////////////////
747 /// Align up to specified alignment
748 /// Note: IsPow2(alignment) MUST be true
749 //////////////////////////////////////////////////////////////////////////
750 template <typename T1, typename T2>
751 INLINE
752 static T1 AlignUpPow2(T1 value, T2 alignment)
753 {
754 return AlignDownPow2(value + T1(alignment - 1), alignment);
755 }
756
757 //////////////////////////////////////////////////////////////////////////
758 /// Align up ptr to specified alignment
759 /// Note: IsPow2(alignment) MUST be true
760 //////////////////////////////////////////////////////////////////////////
761 template <typename T1, typename T2>
762 INLINE
763 static T1* AlignUpPow2(T1* value, T2 alignment)
764 {
765 return reinterpret_cast<T1*>(
766 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
767 }
768
769 //////////////////////////////////////////////////////////////////////////
770 /// Align down to specified alignment
771 //////////////////////////////////////////////////////////////////////////
772 template <typename T1, typename T2>
773 INLINE
774 static T1 AlignDown(T1 value, T2 alignment)
775 {
776 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
777 return value - T1(value % alignment);
778 }
779
780 //////////////////////////////////////////////////////////////////////////
781 /// Align down to specified alignment
782 //////////////////////////////////////////////////////////////////////////
783 template <typename T1, typename T2>
784 INLINE
785 static T1* AlignDown(T1* value, T2 alignment)
786 {
787 return (T1*)AlignDown(uintptr_t(value), alignment);
788 }
789
790 //////////////////////////////////////////////////////////////////////////
791 /// Align up to specified alignment
792 /// Note: IsPow2(alignment) MUST be true
793 //////////////////////////////////////////////////////////////////////////
794 template <typename T1, typename T2>
795 INLINE
796 static T1 AlignUp(T1 value, T2 alignment)
797 {
798 return AlignDown(value + T1(alignment - 1), alignment);
799 }
800
801 //////////////////////////////////////////////////////////////////////////
802 /// Align up to specified alignment
803 /// Note: IsPow2(alignment) MUST be true
804 //////////////////////////////////////////////////////////////////////////
805 template <typename T1, typename T2>
806 INLINE
807 static T1* AlignUp(T1* value, T2 alignment)
808 {
809 return AlignDown(PtrAdd(value, alignment - 1), alignment);
810 }
811
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size. Elements are packed into size_t
/// words, lowest element in the lowest bits of each word.
///
/// T               - type returned for each element
/// BitsPerElementT - element width in bits; must divide the bit width of
///                   size_t evenly (the full word width is allowed)
/// ArrayLenT       - number of elements
///
/// Storage starts out zeroed and only a read accessor is provided here.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;

    static_assert(BitsPerElementT > 0 && BitsPerElementT <= BITS_PER_WORD,
        "Element size must be between 1 bit and the pointer size");

    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Built by shifting an all-ones word right instead of shifting 1 left:
    // the left shift would be by the full word width (undefined) when an
    // element fills a whole word.
    static const size_t ELEMENT_MASK = (~size_t(0)) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Reads one element.
    /// @param elementIndex - element position; not range checked
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
839
// Turns a list of runtime bools into compile-time template arguments.
// Each GetFunc call consumes one bool and appends std::true_type or
// std::false_type to ArgsB; once the last bool is consumed the accumulated
// type list is handed to TermT::GetFunc<...>(), whose result is returned.
// TermT must provide a FuncType typedef and a static GetFunc template.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    // Last argument: ask TermT for the function matching the full type list.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (!bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::false_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::true_type>();
    }

    // More arguments follow: record this one and recurse on the rest.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (!bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
    }
};
868
869 //////////////////////////////////////////////////////////////////////////
870 /// Helpers used to get / set environment variable
871 //////////////////////////////////////////////////////////////////////////
872 static INLINE std::string GetEnv(const std::string& variableName)
873 {
874 std::string output;
875 #if defined(_WIN32)
876 DWORD valueSize = GetEnvironmentVariableA(variableName.c_str(), nullptr, 0);
877 if (!valueSize) return output;
878 output.resize(valueSize - 1); // valueSize includes null, output.resize() does not
879 GetEnvironmentVariableA(variableName.c_str(), &output[0], valueSize);
880 #else
881 output = getenv(variableName.c_str());
882 #endif
883
884 return output;
885 }
886
887 static INLINE void SetEnv(const std::string& variableName, const std::string& value)
888 {
889 #if defined(_WIN32)
890 SetEnvironmentVariableA(variableName.c_str(), value.c_str());
891 #else
892 setenv(variableName.c_str(), value.c_str(), true);
893 #endif
894 }
895
896 //////////////////////////////////////////////////////////////////////////
897 /// Abstraction for dynamically loading modules and getting functions
898 //////////////////////////////////////////////////////////////////////////
899 #if defined(_WIN32)
900 typedef HMODULE SWR_MODULE_HANDLE;
901 #else
902 #include <dlfcn.h>
903 typedef void* SWR_MODULE_HANDLE;
904 #endif
905
906 static inline SWR_MODULE_HANDLE SWR_API LoadModule(const char* szModuleName)
907 {
908 #if defined(_WIN32)
909 return LoadLibraryA(szModuleName);
910 #else
911 return dlopen(szModuleName, RTLD_LAZY | RTLD_LOCAL);
912 #endif
913 }
914
915 static inline void SWR_API FreeModuleHandle(SWR_MODULE_HANDLE hModule)
916 {
917 if (hModule)
918 {
919 #if defined(_WIN32)
920 FreeLibrary((HMODULE)hModule);
921 #else
922 dlclose(hModule);
923 #endif
924 }
925 }
926
927 static inline void* SWR_API GetProcFromModule(SWR_MODULE_HANDLE hModule, const char* szProcName)
928 {
929 if (hModule && szProcName)
930 {
931 #if defined(_WIN32)
932 return GetProcAddress((HMODULE)hModule, szProcName);
933 #else
934 return dlsym(hModule, szProcName);
935 #endif
936 }
937
938 return nullptr;
939 }
940
941 template<typename T>
942 static inline void GetProcFromModule(SWR_MODULE_HANDLE hModule, const char* szProcName, T& outFunc)
943 {
944 outFunc = (T)GetProcFromModule(hModule, szProcName);
945 }
946