swr: [rasterizer core] don't assume linux is 64-bit
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include "common/os.h"
32 #include "common/simdintrin.h"
33 #include "common/swr_assert.h"
34
35 #if defined(_WIN32)
36 void SaveImageToPNGFile(
37 const WCHAR *pFilename,
38 void *pBuffer,
39 uint32_t width,
40 uint32_t height);
41
42 void OpenBitmapFromFile(
43 const WCHAR *pFilename,
44 void **pBuffer,
45 uint32_t *width,
46 uint32_t *height);
47 #endif
48
49 #if defined(_WIN64) || defined(__x86_64__)
50 #define _MM_INSERT_EPI64 _mm_insert_epi64
51 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
52 #else
53 INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
54 {
55 OSALIGNLINE(uint32_t) elems[4];
56 _mm_store_si128((__m128i*)elems, a);
57 if (ndx == 0)
58 {
59 uint64_t foo = elems[0];
60 foo |= (uint64_t)elems[1] << 32;
61 return foo;
62 }
63 else
64 {
65 uint64_t foo = elems[2];
66 foo |= (uint64_t)elems[3] << 32;
67 return foo;
68 }
69 }
70
71 INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
72 {
73 OSALIGNLINE(int64_t) elems[2];
74 _mm_store_si128((__m128i*)elems, a);
75 if (ndx == 0)
76 {
77 elems[0] = b;
78 }
79 else
80 {
81 elems[1] = b;
82 }
83 __m128i out;
84 out = _mm_load_si128((const __m128i*)elems);
85 return out;
86 }
87 #endif
88
89 OSALIGNLINE(struct) BBOX
90 {
91 int top, bottom, left, right;
92
93 BBOX() {}
94 BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
95
96 bool operator==(const BBOX& rhs)
97 {
98 return (this->top == rhs.top &&
99 this->bottom == rhs.bottom &&
100 this->left == rhs.left &&
101 this->right == rhs.right);
102 }
103
104 bool operator!=(const BBOX& rhs)
105 {
106 return !(*this == rhs);
107 }
108 };
109
110 struct simdBBox
111 {
112 simdscalari top, bottom, left, right;
113 };
114
115 INLINE
116 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
117 {
118 __m128i row0i = _mm_castps_si128(row0);
119 __m128i row1i = _mm_castps_si128(row1);
120 __m128i row2i = _mm_castps_si128(row2);
121 __m128i row3i = _mm_castps_si128(row3);
122
123 __m128i vTemp = row2i;
124 row2i = _mm_unpacklo_epi32(row2i, row3i);
125 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
126
127 row3i = row0i;
128 row0i = _mm_unpacklo_epi32(row0i, row1i);
129 row3i = _mm_unpackhi_epi32(row3i, row1i);
130
131 row1i = row0i;
132 row0i = _mm_unpacklo_epi64(row0i, row2i);
133 row1i = _mm_unpackhi_epi64(row1i, row2i);
134
135 row2i = row3i;
136 row2i = _mm_unpacklo_epi64(row2i, vTemp);
137 row3i = _mm_unpackhi_epi64(row3i, vTemp);
138
139 row0 = _mm_castsi128_ps(row0i);
140 row1 = _mm_castsi128_ps(row1i);
141 row2 = _mm_castsi128_ps(row2i);
142 row3 = _mm_castsi128_ps(row3i);
143 }
144
145 INLINE
146 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
147 {
148 __m128i vTemp = row2;
149 row2 = _mm_unpacklo_epi32(row2, row3);
150 vTemp = _mm_unpackhi_epi32(vTemp, row3);
151
152 row3 = row0;
153 row0 = _mm_unpacklo_epi32(row0, row1);
154 row3 = _mm_unpackhi_epi32(row3, row1);
155
156 row1 = row0;
157 row0 = _mm_unpacklo_epi64(row0, row2);
158 row1 = _mm_unpackhi_epi64(row1, row2);
159
160 row2 = row3;
161 row2 = _mm_unpacklo_epi64(row2, vTemp);
162 row3 = _mm_unpackhi_epi64(row3, vTemp);
163 }
164
// Compiler version folded into one comparable integer:
// major * 10000 + minor * 100 + patchlevel.
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)

// When __GNUC__ is defined and the version is below 4.9.0, map the
// *_undefined_* intrinsic names onto their setzero counterparts
// (presumably because the undefined intrinsics are unavailable there -
// confirm).
#if defined(__GNUC__) && (GCC_VERSION < 40900)
    #define _mm_undefined_ps _mm_setzero_ps
    #define _mm_undefined_si128 _mm_setzero_si128
    #if KNOB_SIMD_WIDTH == 8
        #define _mm256_undefined_ps _mm256_setzero_ps
    #endif
#endif
176
177 #if KNOB_SIMD_WIDTH == 8
178 INLINE
179 void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
180 {
181 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
182 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
183 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
184 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
185
186 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
187 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
188 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
189 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
190
191 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
192 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
193 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
194 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
195
196 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
197 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
198 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
199 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
200 }
201
202 INLINE
203 void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
204 {
205 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
206 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
207 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
208 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
209
210 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
211 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
212 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
213 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
214
215 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
216 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
217 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
218 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
219
220 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
221 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
222 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
223 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
224 }
225
226 INLINE
227 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
228 {
229 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
230 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
231 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
232 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
233 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
234 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
235 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
236 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
237 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
238 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
239 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
240 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
241 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
242 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
243 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
244 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
245 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
246 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
247 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
248 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
249 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
250 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
251 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
252 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
253 }
254
255 INLINE
256 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
257 {
258 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
259 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
260 }
261 #endif
262
263 //////////////////////////////////////////////////////////////////////////
264 /// TranposeSingleComponent
265 //////////////////////////////////////////////////////////////////////////
266 template<uint32_t bpp>
267 struct TransposeSingleComponent
268 {
269 //////////////////////////////////////////////////////////////////////////
270 /// @brief Pass-thru for single component.
271 /// @param pSrc - source data in SOA form
272 /// @param pDst - output data in AOS form
273 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
274 {
275 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
276 }
277 };
278
279 //////////////////////////////////////////////////////////////////////////
280 /// Transpose8_8_8_8
281 //////////////////////////////////////////////////////////////////////////
282 struct Transpose8_8_8_8
283 {
284 //////////////////////////////////////////////////////////////////////////
285 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
286 /// @param pSrc - source data in SOA form
287 /// @param pDst - output data in AOS form
288 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
289 {
290 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
291 #if KNOB_SIMD_WIDTH == 8
292 #if KNOB_ARCH == KNOB_ARCH_AVX
293 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
294 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
295 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
296 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
297 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
298 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
299 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
300 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
301 _mm_store_si128((__m128i*)pDst, c0123lo);
302 _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
303 #elif KNOB_ARCH == KNOB_ARCH_AVX2
304 simdscalari dst01 = _mm256_shuffle_epi8(src,
305 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
306 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
307 dst23 = _mm256_shuffle_epi8(dst23,
308 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
309 simdscalari dst = _mm256_or_si256(dst01, dst23);
310 _simd_store_si((simdscalari*)pDst, dst);
311 #endif
312 #else
313 #error Unsupported vector width
314 #endif
315 }
316 };
317
318 //////////////////////////////////////////////////////////////////////////
319 /// Transpose8_8_8
320 //////////////////////////////////////////////////////////////////////////
321 struct Transpose8_8_8
322 {
323 //////////////////////////////////////////////////////////////////////////
324 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
325 /// @param pSrc - source data in SOA form
326 /// @param pDst - output data in AOS form
327 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
328 };
329
330 //////////////////////////////////////////////////////////////////////////
331 /// Transpose8_8
332 //////////////////////////////////////////////////////////////////////////
333 struct Transpose8_8
334 {
335 //////////////////////////////////////////////////////////////////////////
336 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
337 /// @param pSrc - source data in SOA form
338 /// @param pDst - output data in AOS form
339 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
340 {
341 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
342
343 #if KNOB_SIMD_WIDTH == 8
344 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
345 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
346 rg = _mm_unpacklo_epi8(rg, g);
347 _mm_store_si128((__m128i*)pDst, rg);
348 #else
349 #error Unsupported vector width
350 #endif
351 }
352 };
353
354 //////////////////////////////////////////////////////////////////////////
355 /// Transpose32_32_32_32
356 //////////////////////////////////////////////////////////////////////////
357 struct Transpose32_32_32_32
358 {
359 //////////////////////////////////////////////////////////////////////////
360 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
361 /// @param pSrc - source data in SOA form
362 /// @param pDst - output data in AOS form
363 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
364 {
365 #if KNOB_SIMD_WIDTH == 8
366 simdscalar src0 = _simd_load_ps((const float*)pSrc);
367 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
368 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
369 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
370
371 __m128 vDst[8];
372 vTranspose4x8(vDst, src0, src1, src2, src3);
373 _mm_store_ps((float*)pDst, vDst[0]);
374 _mm_store_ps((float*)pDst+4, vDst[1]);
375 _mm_store_ps((float*)pDst+8, vDst[2]);
376 _mm_store_ps((float*)pDst+12, vDst[3]);
377 _mm_store_ps((float*)pDst+16, vDst[4]);
378 _mm_store_ps((float*)pDst+20, vDst[5]);
379 _mm_store_ps((float*)pDst+24, vDst[6]);
380 _mm_store_ps((float*)pDst+28, vDst[7]);
381 #else
382 #error Unsupported vector width
383 #endif
384 }
385 };
386
387 //////////////////////////////////////////////////////////////////////////
388 /// Transpose32_32_32
389 //////////////////////////////////////////////////////////////////////////
390 struct Transpose32_32_32
391 {
392 //////////////////////////////////////////////////////////////////////////
393 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
394 /// @param pSrc - source data in SOA form
395 /// @param pDst - output data in AOS form
396 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
397 {
398 #if KNOB_SIMD_WIDTH == 8
399 simdscalar src0 = _simd_load_ps((const float*)pSrc);
400 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
401 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
402
403 __m128 vDst[8];
404 vTranspose3x8(vDst, src0, src1, src2);
405 _mm_store_ps((float*)pDst, vDst[0]);
406 _mm_store_ps((float*)pDst + 4, vDst[1]);
407 _mm_store_ps((float*)pDst + 8, vDst[2]);
408 _mm_store_ps((float*)pDst + 12, vDst[3]);
409 _mm_store_ps((float*)pDst + 16, vDst[4]);
410 _mm_store_ps((float*)pDst + 20, vDst[5]);
411 _mm_store_ps((float*)pDst + 24, vDst[6]);
412 _mm_store_ps((float*)pDst + 28, vDst[7]);
413 #else
414 #error Unsupported vector width
415 #endif
416 }
417 };
418
419 //////////////////////////////////////////////////////////////////////////
420 /// Transpose32_32
421 //////////////////////////////////////////////////////////////////////////
422 struct Transpose32_32
423 {
424 //////////////////////////////////////////////////////////////////////////
425 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
426 /// @param pSrc - source data in SOA form
427 /// @param pDst - output data in AOS form
428 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
429 {
430 const float* pfSrc = (const float*)pSrc;
431 __m128 src_r0 = _mm_load_ps(pfSrc + 0);
432 __m128 src_r1 = _mm_load_ps(pfSrc + 4);
433 __m128 src_g0 = _mm_load_ps(pfSrc + 8);
434 __m128 src_g1 = _mm_load_ps(pfSrc + 12);
435
436 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
437 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
438 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
439 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
440
441 float* pfDst = (float*)pDst;
442 _mm_store_ps(pfDst + 0, dst0);
443 _mm_store_ps(pfDst + 4, dst1);
444 _mm_store_ps(pfDst + 8, dst2);
445 _mm_store_ps(pfDst + 12, dst3);
446 }
447 };
448
449 //////////////////////////////////////////////////////////////////////////
450 /// Transpose16_16_16_16
451 //////////////////////////////////////////////////////////////////////////
452 struct Transpose16_16_16_16
453 {
454 //////////////////////////////////////////////////////////////////////////
455 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
456 /// @param pSrc - source data in SOA form
457 /// @param pDst - output data in AOS form
458 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
459 {
460 #if KNOB_SIMD_WIDTH == 8
461 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
462 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
463
464 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
465 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
466 __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
467 __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
468
469 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
470 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
471 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
472 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
473
474 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
475 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
476 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
477 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
478
479 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
480 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
481 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
482 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
483 #else
484 #error Unsupported vector width
485 #endif
486 }
487 };
488
489 //////////////////////////////////////////////////////////////////////////
490 /// Transpose16_16_16
491 //////////////////////////////////////////////////////////////////////////
492 struct Transpose16_16_16
493 {
494 //////////////////////////////////////////////////////////////////////////
495 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
496 /// @param pSrc - source data in SOA form
497 /// @param pDst - output data in AOS form
498 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
499 {
500 #if KNOB_SIMD_WIDTH == 8
501 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
502
503 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
504 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
505 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
506 __m128i src_a = _mm_undefined_si128();
507
508 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
509 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
510 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
511 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
512
513 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
514 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
515 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
516 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
517
518 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
519 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
520 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
521 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
522 #else
523 #error Unsupported vector width
524 #endif
525 }
526 };
527
528 //////////////////////////////////////////////////////////////////////////
529 /// Transpose16_16
530 //////////////////////////////////////////////////////////////////////////
531 struct Transpose16_16
532 {
533 //////////////////////////////////////////////////////////////////////////
534 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
535 /// @param pSrc - source data in SOA form
536 /// @param pDst - output data in AOS form
537 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
538 {
539 simdscalar src = _simd_load_ps((const float*)pSrc);
540
541 #if KNOB_SIMD_WIDTH == 8
542 __m128 comp0 = _mm256_castps256_ps128(src);
543 __m128 comp1 = _mm256_extractf128_ps(src, 1);
544
545 __m128i comp0i = _mm_castps_si128(comp0);
546 __m128i comp1i = _mm_castps_si128(comp1);
547
548 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
549 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
550
551 _mm_store_si128((__m128i*)pDst, resLo);
552 _mm_store_si128((__m128i*)pDst + 1, resHi);
553 #else
554 #error Unsupported vector width
555 #endif
556 }
557 };
558
559 //////////////////////////////////////////////////////////////////////////
560 /// Transpose24_8
561 //////////////////////////////////////////////////////////////////////////
562 struct Transpose24_8
563 {
564 //////////////////////////////////////////////////////////////////////////
565 /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
566 /// @param pSrc - source data in SOA form
567 /// @param pDst - output data in AOS form
568 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
569 };
570
571 //////////////////////////////////////////////////////////////////////////
572 /// Transpose32_8_24
573 //////////////////////////////////////////////////////////////////////////
574 struct Transpose32_8_24
575 {
576 //////////////////////////////////////////////////////////////////////////
577 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
578 /// @param pSrc - source data in SOA form
579 /// @param pDst - output data in AOS form
580 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
581 };
582
583
584
585 //////////////////////////////////////////////////////////////////////////
586 /// Transpose4_4_4_4
587 //////////////////////////////////////////////////////////////////////////
588 struct Transpose4_4_4_4
589 {
590 //////////////////////////////////////////////////////////////////////////
591 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
592 /// @param pSrc - source data in SOA form
593 /// @param pDst - output data in AOS form
594 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
595 };
596
597 //////////////////////////////////////////////////////////////////////////
598 /// Transpose5_6_5
599 //////////////////////////////////////////////////////////////////////////
600 struct Transpose5_6_5
601 {
602 //////////////////////////////////////////////////////////////////////////
603 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
604 /// @param pSrc - source data in SOA form
605 /// @param pDst - output data in AOS form
606 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
607 };
608
609 //////////////////////////////////////////////////////////////////////////
610 /// Transpose9_9_9_5
611 //////////////////////////////////////////////////////////////////////////
612 struct Transpose9_9_9_5
613 {
614 //////////////////////////////////////////////////////////////////////////
615 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
616 /// @param pSrc - source data in SOA form
617 /// @param pDst - output data in AOS form
618 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
619 };
620
621 //////////////////////////////////////////////////////////////////////////
622 /// Transpose5_5_5_1
623 //////////////////////////////////////////////////////////////////////////
624 struct Transpose5_5_5_1
625 {
626 //////////////////////////////////////////////////////////////////////////
627 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
628 /// @param pSrc - source data in SOA form
629 /// @param pDst - output data in AOS form
630 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
631 };
632
633 //////////////////////////////////////////////////////////////////////////
634 /// Transpose10_10_10_2
635 //////////////////////////////////////////////////////////////////////////
636 struct Transpose10_10_10_2
637 {
638 //////////////////////////////////////////////////////////////////////////
639 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
640 /// @param pSrc - source data in SOA form
641 /// @param pDst - output data in AOS form
642 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
643 };
644
645 //////////////////////////////////////////////////////////////////////////
646 /// Transpose11_11_10
647 //////////////////////////////////////////////////////////////////////////
648 struct Transpose11_11_10
649 {
650 //////////////////////////////////////////////////////////////////////////
651 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
652 /// @param pSrc - source data in SOA form
653 /// @param pDst - output data in AOS form
654 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
655 };
656
657 // helper function to unroll loops
658 template<int Begin, int End, int Step = 1>
659 struct UnrollerL {
660 template<typename Lambda>
661 INLINE static void step(Lambda& func) {
662 func(Begin);
663 UnrollerL<Begin + Step, End, Step>::step(func);
664 }
665 };
666
667 template<int End, int Step>
668 struct UnrollerL<End, End, Step> {
669 template<typename Lambda>
670 static void step(Lambda& func) {
671 }
672 };
673
674 // general CRC compute
675 INLINE
676 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
677 {
678 #if defined(_WIN64) || defined(__x86_64__)
679 uint32_t sizeInQwords = size / sizeof(uint64_t);
680 uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
681 uint64_t* pDataWords = (uint64_t*)pData;
682 for (uint32_t i = 0; i < sizeInQwords; ++i)
683 {
684 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
685 }
686 #else
687 uint32_t sizeInDwords = size / sizeof(uint32_t);
688 uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
689 uint32_t* pDataWords = (uint32_t*)pData;
690 for (uint32_t i = 0; i < sizeInDwords; ++i)
691 {
692 crc = _mm_crc32_u32(crc, *pDataWords++);
693 }
694 #endif
695
696 BYTE* pRemainderBytes = (BYTE*)pDataWords;
697 for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
698 {
699 crc = _mm_crc32_u8(crc, *pRemainderBytes++);
700 }
701
702 return crc;
703 }
704
705 //////////////////////////////////////////////////////////////////////////
706 /// Add byte offset to any-type pointer
707 //////////////////////////////////////////////////////////////////////////
708 template <typename T>
709 INLINE
710 static T* PtrAdd(T* p, intptr_t offset)
711 {
712 intptr_t intp = reinterpret_cast<intptr_t>(p);
713 return reinterpret_cast<T*>(intp + offset);
714 }
715
716 //////////////////////////////////////////////////////////////////////////
717 /// Is a power-of-2?
718 //////////////////////////////////////////////////////////////////////////
719 template <typename T>
720 INLINE
721 static bool IsPow2(T value)
722 {
723 return value == (value & (0 - value));
724 }
725
726 //////////////////////////////////////////////////////////////////////////
727 /// Align down to specified alignment
728 /// Note: IsPow2(alignment) MUST be true
729 //////////////////////////////////////////////////////////////////////////
730 template <typename T1, typename T2>
731 INLINE
732 static T1 AlignDownPow2(T1 value, T2 alignment)
733 {
734 SWR_ASSERT(IsPow2(alignment));
735 return value & ~T1(alignment - 1);
736 }
737
738 //////////////////////////////////////////////////////////////////////////
739 /// Align up to specified alignment
740 /// Note: IsPow2(alignment) MUST be true
741 //////////////////////////////////////////////////////////////////////////
742 template <typename T1, typename T2>
743 INLINE
744 static T1 AlignUpPow2(T1 value, T2 alignment)
745 {
746 return AlignDownPow2(value + T1(alignment - 1), alignment);
747 }
748
749 //////////////////////////////////////////////////////////////////////////
750 /// Align up ptr to specified alignment
751 /// Note: IsPow2(alignment) MUST be true
752 //////////////////////////////////////////////////////////////////////////
753 template <typename T1, typename T2>
754 INLINE
755 static T1* AlignUpPow2(T1* value, T2 alignment)
756 {
757 return reinterpret_cast<T1*>(
758 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
759 }
760
761 //////////////////////////////////////////////////////////////////////////
762 /// Align down to specified alignment
763 //////////////////////////////////////////////////////////////////////////
764 template <typename T1, typename T2>
765 INLINE
766 static T1 AlignDown(T1 value, T2 alignment)
767 {
768 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
769 return value - T1(value % alignment);
770 }
771
772 //////////////////////////////////////////////////////////////////////////
773 /// Align down to specified alignment
774 //////////////////////////////////////////////////////////////////////////
775 template <typename T1, typename T2>
776 INLINE
777 static T1* AlignDown(T1* value, T2 alignment)
778 {
779 return (T1*)AlignDown(uintptr_t(value), alignment);
780 }
781
782 //////////////////////////////////////////////////////////////////////////
783 /// Align up to specified alignment
784 /// Note: IsPow2(alignment) MUST be true
785 //////////////////////////////////////////////////////////////////////////
786 template <typename T1, typename T2>
787 INLINE
788 static T1 AlignUp(T1 value, T2 alignment)
789 {
790 return AlignDown(value + T1(alignment - 1), alignment);
791 }
792
793 //////////////////////////////////////////////////////////////////////////
794 /// Align up to specified alignment
795 /// Note: IsPow2(alignment) MUST be true
796 //////////////////////////////////////////////////////////////////////////
797 template <typename T1, typename T2>
798 INLINE
799 static T1* AlignUp(T1* value, T2 alignment)
800 {
801 return AlignDown(PtrAdd(value, alignment - 1), alignment);
802 }
803
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
///
/// ArrayLenT elements of BitsPerElementT bits each are packed into size_t
/// words, with lower element indices in the less significant bits of a
/// word. BitsPerElementT must divide the word size evenly (enforced at
/// compile time); an element may also fill a whole word. Storage starts
/// out zeroed and this class only offers read access.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Built by shifting an all-ones word right instead of (1 << bits) - 1,
    // so a whole-word element never shifts by the full width of size_t.
    static const size_t ELEMENT_MASK = (~size_t(0)) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Reads one element.
    /// @param elementIndex - index of the element; not range-checked
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};