gallium/swr: add OpenSWR rasterizer
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include "common/os.h"
32 #include "common/simdintrin.h"
33 #include "common/swr_assert.h"
34
35 #if defined(_WIN32)
36 void SaveImageToPNGFile(
37 const WCHAR *pFilename,
38 void *pBuffer,
39 uint32_t width,
40 uint32_t height);
41
42 void OpenBitmapFromFile(
43 const WCHAR *pFilename,
44 void **pBuffer,
45 uint32_t *width,
46 uint32_t *height);
47 #endif
48
49 /// @todo assume linux is always 64 bit
50 #if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
51 #define _MM_INSERT_EPI64 _mm_insert_epi64
52 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
53 #else
54 INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
55 {
56 OSALIGNLINE(uint32_t) elems[4];
57 _mm_store_si128((__m128i*)elems, a);
58 if (ndx == 0)
59 {
60 uint64_t foo = elems[0];
61 foo |= (uint64_t)elems[1] << 32;
62 return foo;
63 }
64 else
65 {
66 uint64_t foo = elems[2];
67 foo |= (uint64_t)elems[3] << 32;
68 return foo;
69 }
70 }
71
72 INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
73 {
74 OSALIGNLINE(int64_t) elems[2];
75 _mm_store_si128((__m128i*)elems, a);
76 if (ndx == 0)
77 {
78 elems[0] = b;
79 }
80 else
81 {
82 elems[1] = b;
83 }
84 __m128i out;
85 out = _mm_load_si128((const __m128i*)elems);
86 return out;
87 }
88 #endif
89
90 OSALIGNLINE(struct) BBOX
91 {
92 int top, bottom, left, right;
93
94 BBOX() {}
95 BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
96
97 bool operator==(const BBOX& rhs)
98 {
99 return (this->top == rhs.top &&
100 this->bottom == rhs.bottom &&
101 this->left == rhs.left &&
102 this->right == rhs.right);
103 }
104
105 bool operator!=(const BBOX& rhs)
106 {
107 return !(*this == rhs);
108 }
109 };
110
111 struct simdBBox
112 {
113 simdscalari top, bottom, left, right;
114 };
115
116 INLINE
117 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
118 {
119 __m128i row0i = _mm_castps_si128(row0);
120 __m128i row1i = _mm_castps_si128(row1);
121 __m128i row2i = _mm_castps_si128(row2);
122 __m128i row3i = _mm_castps_si128(row3);
123
124 __m128i vTemp = row2i;
125 row2i = _mm_unpacklo_epi32(row2i, row3i);
126 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
127
128 row3i = row0i;
129 row0i = _mm_unpacklo_epi32(row0i, row1i);
130 row3i = _mm_unpackhi_epi32(row3i, row1i);
131
132 row1i = row0i;
133 row0i = _mm_unpacklo_epi64(row0i, row2i);
134 row1i = _mm_unpackhi_epi64(row1i, row2i);
135
136 row2i = row3i;
137 row2i = _mm_unpacklo_epi64(row2i, vTemp);
138 row3i = _mm_unpackhi_epi64(row3i, vTemp);
139
140 row0 = _mm_castsi128_ps(row0i);
141 row1 = _mm_castsi128_ps(row1i);
142 row2 = _mm_castsi128_ps(row2i);
143 row3 = _mm_castsi128_ps(row3i);
144 }
145
146 INLINE
147 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
148 {
149 __m128i vTemp = row2;
150 row2 = _mm_unpacklo_epi32(row2, row3);
151 vTemp = _mm_unpackhi_epi32(vTemp, row3);
152
153 row3 = row0;
154 row0 = _mm_unpacklo_epi32(row0, row1);
155 row3 = _mm_unpackhi_epi32(row3, row1);
156
157 row1 = row0;
158 row0 = _mm_unpacklo_epi64(row0, row2);
159 row1 = _mm_unpackhi_epi64(row1, row2);
160
161 row2 = row3;
162 row2 = _mm_unpacklo_epi64(row2, vTemp);
163 row3 = _mm_unpackhi_epi64(row3, vTemp);
164 }
165
// GCC version folded into one comparable integer:
// major * 10000 + minor * 100 + patchlevel.
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)

// For GNU-compatible compilers reporting a version below 4.9, map the
// *_undefined_* intrinsics onto their zero-initializing counterparts.
#if defined(__GNUC__) && (GCC_VERSION < 40900)
#define _mm_undefined_ps        _mm_setzero_ps
#define _mm_undefined_si128     _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps     _mm256_setzero_ps
#endif
#endif
177
178 #if KNOB_SIMD_WIDTH == 8
179 INLINE
180 void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
181 {
182 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
183 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
184 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
185 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
186
187 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
188 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
189 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
190 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
191
192 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
193 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
194 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
195 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
196
197 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
198 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
199 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
200 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
201 }
202
203 INLINE
204 void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
205 {
206 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
207 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
208 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
209 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
210
211 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
212 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
213 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
214 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
215
216 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
217 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
218 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
219 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
220
221 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
222 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
223 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
224 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
225 }
226
227 INLINE
228 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
229 {
230 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
231 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
232 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
233 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
234 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
235 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
236 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
237 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
238 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
239 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
240 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
241 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
242 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
243 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
244 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
245 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
246 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
247 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
248 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
249 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
250 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
251 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
252 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
253 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
254 }
255
256 INLINE
257 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
258 {
259 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
260 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
261 }
262 #endif
263
264 //////////////////////////////////////////////////////////////////////////
265 /// TranposeSingleComponent
266 //////////////////////////////////////////////////////////////////////////
267 template<uint32_t bpp>
268 struct TransposeSingleComponent
269 {
270 //////////////////////////////////////////////////////////////////////////
271 /// @brief Pass-thru for single component.
272 /// @param pSrc - source data in SOA form
273 /// @param pDst - output data in AOS form
274 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
275 {
276 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
277 }
278 };
279
280 //////////////////////////////////////////////////////////////////////////
281 /// Transpose8_8_8_8
282 //////////////////////////////////////////////////////////////////////////
283 struct Transpose8_8_8_8
284 {
285 //////////////////////////////////////////////////////////////////////////
286 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
287 /// @param pSrc - source data in SOA form
288 /// @param pDst - output data in AOS form
289 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
290 {
291 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
292 #if KNOB_SIMD_WIDTH == 8
293 #if KNOB_ARCH == KNOB_ARCH_AVX
294 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
295 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
296 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
297 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
298 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
299 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
300 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
301 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
302 _mm_store_si128((__m128i*)pDst, c0123lo);
303 _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
304 #elif KNOB_ARCH == KNOB_ARCH_AVX2
305 simdscalari dst01 = _mm256_shuffle_epi8(src,
306 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
307 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
308 dst23 = _mm256_shuffle_epi8(dst23,
309 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
310 simdscalari dst = _mm256_or_si256(dst01, dst23);
311 _simd_store_si((simdscalari*)pDst, dst);
312 #endif
313 #else
314 #error Unsupported vector width
315 #endif
316 }
317 };
318
319 //////////////////////////////////////////////////////////////////////////
320 /// Transpose8_8_8
321 //////////////////////////////////////////////////////////////////////////
322 struct Transpose8_8_8
323 {
324 //////////////////////////////////////////////////////////////////////////
325 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
326 /// @param pSrc - source data in SOA form
327 /// @param pDst - output data in AOS form
328 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
329 };
330
331 //////////////////////////////////////////////////////////////////////////
332 /// Transpose8_8
333 //////////////////////////////////////////////////////////////////////////
334 struct Transpose8_8
335 {
336 //////////////////////////////////////////////////////////////////////////
337 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
338 /// @param pSrc - source data in SOA form
339 /// @param pDst - output data in AOS form
340 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
341 {
342 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
343
344 #if KNOB_SIMD_WIDTH == 8
345 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
346 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
347 rg = _mm_unpacklo_epi8(rg, g);
348 _mm_store_si128((__m128i*)pDst, rg);
349 #else
350 #error Unsupported vector width
351 #endif
352 }
353 };
354
355 //////////////////////////////////////////////////////////////////////////
356 /// Transpose32_32_32_32
357 //////////////////////////////////////////////////////////////////////////
358 struct Transpose32_32_32_32
359 {
360 //////////////////////////////////////////////////////////////////////////
361 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
362 /// @param pSrc - source data in SOA form
363 /// @param pDst - output data in AOS form
364 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
365 {
366 #if KNOB_SIMD_WIDTH == 8
367 simdscalar src0 = _simd_load_ps((const float*)pSrc);
368 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
369 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
370 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
371
372 __m128 vDst[8];
373 vTranspose4x8(vDst, src0, src1, src2, src3);
374 _mm_store_ps((float*)pDst, vDst[0]);
375 _mm_store_ps((float*)pDst+4, vDst[1]);
376 _mm_store_ps((float*)pDst+8, vDst[2]);
377 _mm_store_ps((float*)pDst+12, vDst[3]);
378 _mm_store_ps((float*)pDst+16, vDst[4]);
379 _mm_store_ps((float*)pDst+20, vDst[5]);
380 _mm_store_ps((float*)pDst+24, vDst[6]);
381 _mm_store_ps((float*)pDst+28, vDst[7]);
382 #else
383 #error Unsupported vector width
384 #endif
385 }
386 };
387
388 //////////////////////////////////////////////////////////////////////////
389 /// Transpose32_32_32
390 //////////////////////////////////////////////////////////////////////////
391 struct Transpose32_32_32
392 {
393 //////////////////////////////////////////////////////////////////////////
394 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
395 /// @param pSrc - source data in SOA form
396 /// @param pDst - output data in AOS form
397 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
398 {
399 #if KNOB_SIMD_WIDTH == 8
400 simdscalar src0 = _simd_load_ps((const float*)pSrc);
401 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
402 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
403
404 __m128 vDst[8];
405 vTranspose3x8(vDst, src0, src1, src2);
406 _mm_store_ps((float*)pDst, vDst[0]);
407 _mm_store_ps((float*)pDst + 4, vDst[1]);
408 _mm_store_ps((float*)pDst + 8, vDst[2]);
409 _mm_store_ps((float*)pDst + 12, vDst[3]);
410 _mm_store_ps((float*)pDst + 16, vDst[4]);
411 _mm_store_ps((float*)pDst + 20, vDst[5]);
412 _mm_store_ps((float*)pDst + 24, vDst[6]);
413 _mm_store_ps((float*)pDst + 28, vDst[7]);
414 #else
415 #error Unsupported vector width
416 #endif
417 }
418 };
419
420 //////////////////////////////////////////////////////////////////////////
421 /// Transpose32_32
422 //////////////////////////////////////////////////////////////////////////
423 struct Transpose32_32
424 {
425 //////////////////////////////////////////////////////////////////////////
426 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
427 /// @param pSrc - source data in SOA form
428 /// @param pDst - output data in AOS form
429 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
430 {
431 const float* pfSrc = (const float*)pSrc;
432 __m128 src_r0 = _mm_load_ps(pfSrc + 0);
433 __m128 src_r1 = _mm_load_ps(pfSrc + 4);
434 __m128 src_g0 = _mm_load_ps(pfSrc + 8);
435 __m128 src_g1 = _mm_load_ps(pfSrc + 12);
436
437 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
438 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
439 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
440 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
441
442 float* pfDst = (float*)pDst;
443 _mm_store_ps(pfDst + 0, dst0);
444 _mm_store_ps(pfDst + 4, dst1);
445 _mm_store_ps(pfDst + 8, dst2);
446 _mm_store_ps(pfDst + 12, dst3);
447 }
448 };
449
450 //////////////////////////////////////////////////////////////////////////
451 /// Transpose16_16_16_16
452 //////////////////////////////////////////////////////////////////////////
453 struct Transpose16_16_16_16
454 {
455 //////////////////////////////////////////////////////////////////////////
456 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
457 /// @param pSrc - source data in SOA form
458 /// @param pDst - output data in AOS form
459 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
460 {
461 #if KNOB_SIMD_WIDTH == 8
462 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
463 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
464
465 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
466 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
467 __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
468 __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
469
470 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
471 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
472 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
473 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
474
475 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
476 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
477 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
478 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
479
480 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
481 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
482 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
483 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
484 #else
485 #error Unsupported vector width
486 #endif
487 }
488 };
489
490 //////////////////////////////////////////////////////////////////////////
491 /// Transpose16_16_16
492 //////////////////////////////////////////////////////////////////////////
493 struct Transpose16_16_16
494 {
495 //////////////////////////////////////////////////////////////////////////
496 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
497 /// @param pSrc - source data in SOA form
498 /// @param pDst - output data in AOS form
499 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
500 {
501 #if KNOB_SIMD_WIDTH == 8
502 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
503
504 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
505 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
506 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
507 __m128i src_a = _mm_undefined_si128();
508
509 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
510 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
511 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
512 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
513
514 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
515 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
516 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
517 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
518
519 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
520 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
521 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
522 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
523 #else
524 #error Unsupported vector width
525 #endif
526 }
527 };
528
529 //////////////////////////////////////////////////////////////////////////
530 /// Transpose16_16
531 //////////////////////////////////////////////////////////////////////////
532 struct Transpose16_16
533 {
534 //////////////////////////////////////////////////////////////////////////
535 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
536 /// @param pSrc - source data in SOA form
537 /// @param pDst - output data in AOS form
538 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
539 {
540 simdscalar src = _simd_load_ps((const float*)pSrc);
541
542 #if KNOB_SIMD_WIDTH == 8
543 __m128 comp0 = _mm256_castps256_ps128(src);
544 __m128 comp1 = _mm256_extractf128_ps(src, 1);
545
546 __m128i comp0i = _mm_castps_si128(comp0);
547 __m128i comp1i = _mm_castps_si128(comp1);
548
549 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
550 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
551
552 _mm_store_si128((__m128i*)pDst, resLo);
553 _mm_store_si128((__m128i*)pDst + 1, resHi);
554 #else
555 #error Unsupported vector width
556 #endif
557 }
558 };
559
560 //////////////////////////////////////////////////////////////////////////
561 /// Transpose24_8
562 //////////////////////////////////////////////////////////////////////////
563 struct Transpose24_8
564 {
565 //////////////////////////////////////////////////////////////////////////
566 /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
567 /// @param pSrc - source data in SOA form
568 /// @param pDst - output data in AOS form
569 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
570 };
571
572 //////////////////////////////////////////////////////////////////////////
573 /// Transpose32_8_24
574 //////////////////////////////////////////////////////////////////////////
575 struct Transpose32_8_24
576 {
577 //////////////////////////////////////////////////////////////////////////
578 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
579 /// @param pSrc - source data in SOA form
580 /// @param pDst - output data in AOS form
581 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
582 };
583
584
585
586 //////////////////////////////////////////////////////////////////////////
587 /// Transpose4_4_4_4
588 //////////////////////////////////////////////////////////////////////////
589 struct Transpose4_4_4_4
590 {
591 //////////////////////////////////////////////////////////////////////////
592 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
593 /// @param pSrc - source data in SOA form
594 /// @param pDst - output data in AOS form
595 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
596 };
597
598 //////////////////////////////////////////////////////////////////////////
599 /// Transpose5_6_5
600 //////////////////////////////////////////////////////////////////////////
601 struct Transpose5_6_5
602 {
603 //////////////////////////////////////////////////////////////////////////
604 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
605 /// @param pSrc - source data in SOA form
606 /// @param pDst - output data in AOS form
607 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
608 };
609
610 //////////////////////////////////////////////////////////////////////////
611 /// Transpose9_9_9_5
612 //////////////////////////////////////////////////////////////////////////
613 struct Transpose9_9_9_5
614 {
615 //////////////////////////////////////////////////////////////////////////
616 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
617 /// @param pSrc - source data in SOA form
618 /// @param pDst - output data in AOS form
619 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
620 };
621
622 //////////////////////////////////////////////////////////////////////////
623 /// Transpose5_5_5_1
624 //////////////////////////////////////////////////////////////////////////
625 struct Transpose5_5_5_1
626 {
627 //////////////////////////////////////////////////////////////////////////
628 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
629 /// @param pSrc - source data in SOA form
630 /// @param pDst - output data in AOS form
631 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
632 };
633
634 //////////////////////////////////////////////////////////////////////////
635 /// Transpose10_10_10_2
636 //////////////////////////////////////////////////////////////////////////
637 struct Transpose10_10_10_2
638 {
639 //////////////////////////////////////////////////////////////////////////
640 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
641 /// @param pSrc - source data in SOA form
642 /// @param pDst - output data in AOS form
643 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
644 };
645
646 //////////////////////////////////////////////////////////////////////////
647 /// Transpose11_11_10
648 //////////////////////////////////////////////////////////////////////////
649 struct Transpose11_11_10
650 {
651 //////////////////////////////////////////////////////////////////////////
652 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
653 /// @param pSrc - source data in SOA form
654 /// @param pDst - output data in AOS form
655 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
656 };
657
658 // helper function to unroll loops
659 template<int Begin, int End, int Step = 1>
660 struct UnrollerL {
661 template<typename Lambda>
662 INLINE static void step(Lambda& func) {
663 func(Begin);
664 UnrollerL<Begin + Step, End, Step>::step(func);
665 }
666 };
667
668 template<int End, int Step>
669 struct UnrollerL<End, End, Step> {
670 template<typename Lambda>
671 static void step(Lambda& func) {
672 }
673 };
674
675 // general CRC compute
676 INLINE
677 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
678 {
679 #if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
680 uint32_t sizeInQwords = size / sizeof(uint64_t);
681 uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
682 uint64_t* pDataWords = (uint64_t*)pData;
683 for (uint32_t i = 0; i < sizeInQwords; ++i)
684 {
685 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
686 }
687 #else
688 uint32_t sizeInDwords = size / sizeof(uint32_t);
689 uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
690 uint32_t* pDataWords = (uint32_t*)pData;
691 for (uint32_t i = 0; i < sizeInDwords; ++i)
692 {
693 crc = _mm_crc32_u32(crc, *pDataWords++);
694 }
695 #endif
696
697 BYTE* pRemainderBytes = (BYTE*)pDataWords;
698 for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
699 {
700 crc = _mm_crc32_u8(crc, *pRemainderBytes++);
701 }
702
703 return crc;
704 }
705
706 //////////////////////////////////////////////////////////////////////////
707 /// Add byte offset to any-type pointer
708 //////////////////////////////////////////////////////////////////////////
709 template <typename T>
710 INLINE
711 static T* PtrAdd(T* p, intptr_t offset)
712 {
713 intptr_t intp = reinterpret_cast<intptr_t>(p);
714 return reinterpret_cast<T*>(intp + offset);
715 }
716
717 //////////////////////////////////////////////////////////////////////////
718 /// Is a power-of-2?
719 //////////////////////////////////////////////////////////////////////////
720 template <typename T>
721 INLINE
722 static bool IsPow2(T value)
723 {
724 return value == (value & (0 - value));
725 }
726
727 //////////////////////////////////////////////////////////////////////////
728 /// Align down to specified alignment
729 /// Note: IsPow2(alignment) MUST be true
730 //////////////////////////////////////////////////////////////////////////
731 template <typename T1, typename T2>
732 INLINE
733 static T1 AlignDownPow2(T1 value, T2 alignment)
734 {
735 SWR_ASSERT(IsPow2(alignment));
736 return value & ~T1(alignment - 1);
737 }
738
739 //////////////////////////////////////////////////////////////////////////
740 /// Align up to specified alignment
741 /// Note: IsPow2(alignment) MUST be true
742 //////////////////////////////////////////////////////////////////////////
743 template <typename T1, typename T2>
744 INLINE
745 static T1 AlignUpPow2(T1 value, T2 alignment)
746 {
747 return AlignDownPow2(value + T1(alignment - 1), alignment);
748 }
749
750 //////////////////////////////////////////////////////////////////////////
751 /// Align up ptr to specified alignment
752 /// Note: IsPow2(alignment) MUST be true
753 //////////////////////////////////////////////////////////////////////////
754 template <typename T1, typename T2>
755 INLINE
756 static T1* AlignUpPow2(T1* value, T2 alignment)
757 {
758 return reinterpret_cast<T1*>(
759 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
760 }
761
762 //////////////////////////////////////////////////////////////////////////
763 /// Align down to specified alignment
764 //////////////////////////////////////////////////////////////////////////
765 template <typename T1, typename T2>
766 INLINE
767 static T1 AlignDown(T1 value, T2 alignment)
768 {
769 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
770 return value - T1(value % alignment);
771 }
772
773 //////////////////////////////////////////////////////////////////////////
774 /// Align down to specified alignment
775 //////////////////////////////////////////////////////////////////////////
776 template <typename T1, typename T2>
777 INLINE
778 static T1* AlignDown(T1* value, T2 alignment)
779 {
780 return (T1*)AlignDown(uintptr_t(value), alignment);
781 }
782
783 //////////////////////////////////////////////////////////////////////////
784 /// Align up to specified alignment
785 /// Note: IsPow2(alignment) MUST be true
786 //////////////////////////////////////////////////////////////////////////
787 template <typename T1, typename T2>
788 INLINE
789 static T1 AlignUp(T1 value, T2 alignment)
790 {
791 return AlignDown(value + T1(alignment - 1), alignment);
792 }
793
794 //////////////////////////////////////////////////////////////////////////
795 /// Align up to specified alignment
796 /// Note: IsPow2(alignment) MUST be true
797 //////////////////////////////////////////////////////////////////////////
798 template <typename T1, typename T2>
799 INLINE
800 static T1* AlignUp(T1* value, T2 alignment)
801 {
802 return AlignDown(PtrAdd(value, alignment - 1), alignment);
803 }
804
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
///
/// Elements are packed BitsPerElementT bits apiece into size_t words,
/// lowest bits first. All words start out zero.
///
/// @tparam T               - type returned for an element
/// @tparam BitsPerElementT - bits per element; must divide the number of
///                           bits in size_t evenly (full-word elements
///                           are allowed)
/// @tparam ArrayLenT       - number of elements
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Low BitsPerElementT bits set. Built by shifting an all-ones word to
    // the right so that a full-word element (BitsPerElementT ==
    // BITS_PER_WORD) shifts by 0 instead of shifting 1 left by the full
    // word width, which is undefined.
    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Read one element.
    /// @param elementIndex - index of the element; not range-checked
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};