swr: [rasterizer] Fix Coverity issues reported by Mesa developers.
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include "common/os.h"
32 #include "common/simdintrin.h"
33 #include "common/swr_assert.h"
34
35 #if defined(_WIN32)
36 void SaveImageToPNGFile(
37 const WCHAR *pFilename,
38 void *pBuffer,
39 uint32_t width,
40 uint32_t height);
41
42 void OpenBitmapFromFile(
43 const WCHAR *pFilename,
44 void **pBuffer,
45 uint32_t *width,
46 uint32_t *height);
47 #endif
48
49 #if defined(_WIN64) || defined(__x86_64__)
50 #define _MM_INSERT_EPI64 _mm_insert_epi64
51 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
52 #else
53 INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
54 {
55 OSALIGNLINE(uint32_t) elems[4];
56 _mm_store_si128((__m128i*)elems, a);
57 if (ndx == 0)
58 {
59 uint64_t foo = elems[0];
60 foo |= (uint64_t)elems[1] << 32;
61 return foo;
62 }
63 else
64 {
65 uint64_t foo = elems[2];
66 foo |= (uint64_t)elems[3] << 32;
67 return foo;
68 }
69 }
70
71 INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
72 {
73 OSALIGNLINE(int64_t) elems[2];
74 _mm_store_si128((__m128i*)elems, a);
75 if (ndx == 0)
76 {
77 elems[0] = b;
78 }
79 else
80 {
81 elems[1] = b;
82 }
83 __m128i out;
84 out = _mm_load_si128((const __m128i*)elems);
85 return out;
86 }
87 #endif
88
89 OSALIGNLINE(struct) BBOX
90 {
91 int top{ 0 };
92 int bottom{ 0 };
93 int left{ 0 };
94 int right{ 0 };
95
96 BBOX() {}
97 BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
98
99 bool operator==(const BBOX& rhs)
100 {
101 return (this->top == rhs.top &&
102 this->bottom == rhs.bottom &&
103 this->left == rhs.left &&
104 this->right == rhs.right);
105 }
106
107 bool operator!=(const BBOX& rhs)
108 {
109 return !(*this == rhs);
110 }
111 };
112
//////////////////////////////////////////////////////////////////////////
/// simdBBox
/// Bounding-box edges held as SIMD integer vectors (one simdscalari per
/// edge). NOTE(review): presumably one box per SIMD lane - confirm
/// against the users of this struct.
//////////////////////////////////////////////////////////////////////////
struct simdBBox
{
    simdscalari top;
    simdscalari bottom;
    simdscalari left;
    simdscalari right;
};
120
121 INLINE
122 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
123 {
124 __m128i row0i = _mm_castps_si128(row0);
125 __m128i row1i = _mm_castps_si128(row1);
126 __m128i row2i = _mm_castps_si128(row2);
127 __m128i row3i = _mm_castps_si128(row3);
128
129 __m128i vTemp = row2i;
130 row2i = _mm_unpacklo_epi32(row2i, row3i);
131 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
132
133 row3i = row0i;
134 row0i = _mm_unpacklo_epi32(row0i, row1i);
135 row3i = _mm_unpackhi_epi32(row3i, row1i);
136
137 row1i = row0i;
138 row0i = _mm_unpacklo_epi64(row0i, row2i);
139 row1i = _mm_unpackhi_epi64(row1i, row2i);
140
141 row2i = row3i;
142 row2i = _mm_unpacklo_epi64(row2i, vTemp);
143 row3i = _mm_unpackhi_epi64(row3i, vTemp);
144
145 row0 = _mm_castsi128_ps(row0i);
146 row1 = _mm_castsi128_ps(row1i);
147 row2 = _mm_castsi128_ps(row2i);
148 row3 = _mm_castsi128_ps(row3i);
149 }
150
151 INLINE
152 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
153 {
154 __m128i vTemp = row2;
155 row2 = _mm_unpacklo_epi32(row2, row3);
156 vTemp = _mm_unpackhi_epi32(vTemp, row3);
157
158 row3 = row0;
159 row0 = _mm_unpacklo_epi32(row0, row1);
160 row3 = _mm_unpackhi_epi32(row3, row1);
161
162 row1 = row0;
163 row0 = _mm_unpacklo_epi64(row0, row2);
164 row1 = _mm_unpackhi_epi64(row1, row2);
165
166 row2 = row3;
167 row2 = _mm_unpacklo_epi64(row2, vTemp);
168 row3 = _mm_unpackhi_epi64(row3, vTemp);
169 }
170
// Compiler version folded into one comparable integer: major * 10000 +
// minor * 100 + patchlevel (e.g. 4.8.2 becomes 40802).
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)

// When a GNU-compatible compiler reports a version below 4.9, route the
// "undefined value" intrinsics to their zero-initializing counterparts
// (presumably because those compilers do not provide them).
#if defined(__GNUC__) && (GCC_VERSION < 40900)
#    define _mm_undefined_ps    _mm_setzero_ps
#    define _mm_undefined_si128 _mm_setzero_si128
#    if KNOB_SIMD_WIDTH == 8
#        define _mm256_undefined_ps _mm256_setzero_ps
#    endif
#endif
182
183 #if KNOB_SIMD_WIDTH == 8
184 INLINE
185 void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
186 {
187 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
188 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
189 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
190 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
191
192 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
193 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
194 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
195 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
196
197 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
198 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
199 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
200 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
201
202 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
203 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
204 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
205 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
206 }
207
208 INLINE
209 void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
210 {
211 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
212 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
213 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
214 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
215
216 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
217 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
218 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
219 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
220
221 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
222 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
223 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
224 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
225
226 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
227 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
228 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
229 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
230 }
231
232 INLINE
233 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
234 {
235 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
236 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
237 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
238 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
239 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
240 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
241 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
242 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
243 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
244 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
245 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
246 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
247 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
248 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
249 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
250 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
251 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
252 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
253 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
254 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
255 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
256 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
257 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
258 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
259 }
260
261 INLINE
262 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
263 {
264 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
265 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
266 }
267 #endif
268
269 //////////////////////////////////////////////////////////////////////////
270 /// TranposeSingleComponent
271 //////////////////////////////////////////////////////////////////////////
272 template<uint32_t bpp>
273 struct TransposeSingleComponent
274 {
275 //////////////////////////////////////////////////////////////////////////
276 /// @brief Pass-thru for single component.
277 /// @param pSrc - source data in SOA form
278 /// @param pDst - output data in AOS form
279 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
280 {
281 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
282 }
283 };
284
285 //////////////////////////////////////////////////////////////////////////
286 /// Transpose8_8_8_8
287 //////////////////////////////////////////////////////////////////////////
288 struct Transpose8_8_8_8
289 {
290 //////////////////////////////////////////////////////////////////////////
291 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
292 /// @param pSrc - source data in SOA form
293 /// @param pDst - output data in AOS form
294 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
295 {
296 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
297 #if KNOB_SIMD_WIDTH == 8
298 #if KNOB_ARCH == KNOB_ARCH_AVX
299 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
300 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
301 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
302 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
303 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
304 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
305 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
306 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
307 _mm_store_si128((__m128i*)pDst, c0123lo);
308 _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
309 #elif KNOB_ARCH == KNOB_ARCH_AVX2
310 simdscalari dst01 = _mm256_shuffle_epi8(src,
311 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
312 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
313 dst23 = _mm256_shuffle_epi8(dst23,
314 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
315 simdscalari dst = _mm256_or_si256(dst01, dst23);
316 _simd_store_si((simdscalari*)pDst, dst);
317 #endif
318 #else
319 #error Unsupported vector width
320 #endif
321 }
322 };
323
324 //////////////////////////////////////////////////////////////////////////
325 /// Transpose8_8_8
326 //////////////////////////////////////////////////////////////////////////
327 struct Transpose8_8_8
328 {
329 //////////////////////////////////////////////////////////////////////////
330 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
331 /// @param pSrc - source data in SOA form
332 /// @param pDst - output data in AOS form
333 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
334 };
335
336 //////////////////////////////////////////////////////////////////////////
337 /// Transpose8_8
338 //////////////////////////////////////////////////////////////////////////
339 struct Transpose8_8
340 {
341 //////////////////////////////////////////////////////////////////////////
342 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
343 /// @param pSrc - source data in SOA form
344 /// @param pDst - output data in AOS form
345 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
346 {
347 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
348
349 #if KNOB_SIMD_WIDTH == 8
350 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
351 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
352 rg = _mm_unpacklo_epi8(rg, g);
353 _mm_store_si128((__m128i*)pDst, rg);
354 #else
355 #error Unsupported vector width
356 #endif
357 }
358 };
359
360 //////////////////////////////////////////////////////////////////////////
361 /// Transpose32_32_32_32
362 //////////////////////////////////////////////////////////////////////////
363 struct Transpose32_32_32_32
364 {
365 //////////////////////////////////////////////////////////////////////////
366 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
367 /// @param pSrc - source data in SOA form
368 /// @param pDst - output data in AOS form
369 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
370 {
371 #if KNOB_SIMD_WIDTH == 8
372 simdscalar src0 = _simd_load_ps((const float*)pSrc);
373 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
374 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
375 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
376
377 __m128 vDst[8];
378 vTranspose4x8(vDst, src0, src1, src2, src3);
379 _mm_store_ps((float*)pDst, vDst[0]);
380 _mm_store_ps((float*)pDst+4, vDst[1]);
381 _mm_store_ps((float*)pDst+8, vDst[2]);
382 _mm_store_ps((float*)pDst+12, vDst[3]);
383 _mm_store_ps((float*)pDst+16, vDst[4]);
384 _mm_store_ps((float*)pDst+20, vDst[5]);
385 _mm_store_ps((float*)pDst+24, vDst[6]);
386 _mm_store_ps((float*)pDst+28, vDst[7]);
387 #else
388 #error Unsupported vector width
389 #endif
390 }
391 };
392
393 //////////////////////////////////////////////////////////////////////////
394 /// Transpose32_32_32
395 //////////////////////////////////////////////////////////////////////////
396 struct Transpose32_32_32
397 {
398 //////////////////////////////////////////////////////////////////////////
399 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
400 /// @param pSrc - source data in SOA form
401 /// @param pDst - output data in AOS form
402 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
403 {
404 #if KNOB_SIMD_WIDTH == 8
405 simdscalar src0 = _simd_load_ps((const float*)pSrc);
406 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
407 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
408
409 __m128 vDst[8];
410 vTranspose3x8(vDst, src0, src1, src2);
411 _mm_store_ps((float*)pDst, vDst[0]);
412 _mm_store_ps((float*)pDst + 4, vDst[1]);
413 _mm_store_ps((float*)pDst + 8, vDst[2]);
414 _mm_store_ps((float*)pDst + 12, vDst[3]);
415 _mm_store_ps((float*)pDst + 16, vDst[4]);
416 _mm_store_ps((float*)pDst + 20, vDst[5]);
417 _mm_store_ps((float*)pDst + 24, vDst[6]);
418 _mm_store_ps((float*)pDst + 28, vDst[7]);
419 #else
420 #error Unsupported vector width
421 #endif
422 }
423 };
424
425 //////////////////////////////////////////////////////////////////////////
426 /// Transpose32_32
427 //////////////////////////////////////////////////////////////////////////
428 struct Transpose32_32
429 {
430 //////////////////////////////////////////////////////////////////////////
431 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
432 /// @param pSrc - source data in SOA form
433 /// @param pDst - output data in AOS form
434 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
435 {
436 const float* pfSrc = (const float*)pSrc;
437 __m128 src_r0 = _mm_load_ps(pfSrc + 0);
438 __m128 src_r1 = _mm_load_ps(pfSrc + 4);
439 __m128 src_g0 = _mm_load_ps(pfSrc + 8);
440 __m128 src_g1 = _mm_load_ps(pfSrc + 12);
441
442 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
443 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
444 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
445 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
446
447 float* pfDst = (float*)pDst;
448 _mm_store_ps(pfDst + 0, dst0);
449 _mm_store_ps(pfDst + 4, dst1);
450 _mm_store_ps(pfDst + 8, dst2);
451 _mm_store_ps(pfDst + 12, dst3);
452 }
453 };
454
455 //////////////////////////////////////////////////////////////////////////
456 /// Transpose16_16_16_16
457 //////////////////////////////////////////////////////////////////////////
458 struct Transpose16_16_16_16
459 {
460 //////////////////////////////////////////////////////////////////////////
461 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
462 /// @param pSrc - source data in SOA form
463 /// @param pDst - output data in AOS form
464 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
465 {
466 #if KNOB_SIMD_WIDTH == 8
467 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
468 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
469
470 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
471 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
472 __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
473 __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
474
475 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
476 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
477 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
478 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
479
480 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
481 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
482 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
483 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
484
485 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
486 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
487 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
488 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
489 #else
490 #error Unsupported vector width
491 #endif
492 }
493 };
494
495 //////////////////////////////////////////////////////////////////////////
496 /// Transpose16_16_16
497 //////////////////////////////////////////////////////////////////////////
498 struct Transpose16_16_16
499 {
500 //////////////////////////////////////////////////////////////////////////
501 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
502 /// @param pSrc - source data in SOA form
503 /// @param pDst - output data in AOS form
504 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
505 {
506 #if KNOB_SIMD_WIDTH == 8
507 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
508
509 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
510 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
511 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
512 __m128i src_a = _mm_undefined_si128();
513
514 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
515 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
516 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
517 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
518
519 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
520 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
521 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
522 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
523
524 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
525 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
526 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
527 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
528 #else
529 #error Unsupported vector width
530 #endif
531 }
532 };
533
534 //////////////////////////////////////////////////////////////////////////
535 /// Transpose16_16
536 //////////////////////////////////////////////////////////////////////////
537 struct Transpose16_16
538 {
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
541 /// @param pSrc - source data in SOA form
542 /// @param pDst - output data in AOS form
543 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
544 {
545 simdscalar src = _simd_load_ps((const float*)pSrc);
546
547 #if KNOB_SIMD_WIDTH == 8
548 __m128 comp0 = _mm256_castps256_ps128(src);
549 __m128 comp1 = _mm256_extractf128_ps(src, 1);
550
551 __m128i comp0i = _mm_castps_si128(comp0);
552 __m128i comp1i = _mm_castps_si128(comp1);
553
554 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
555 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
556
557 _mm_store_si128((__m128i*)pDst, resLo);
558 _mm_store_si128((__m128i*)pDst + 1, resHi);
559 #else
560 #error Unsupported vector width
561 #endif
562 }
563 };
564
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data is not implemented.
    ///        The function is deleted, so any use is rejected at compile
    ///        time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
576
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data is not
    ///        implemented. The function is deleted, so any use is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
588
589
590
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data is not
    ///        implemented. The function is deleted, so any use is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
602
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data is not
    ///        implemented. The function is deleted, so any use is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
614
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data is not
    ///        implemented. The function is deleted, so any use is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
626
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data is not
    ///        implemented. The function is deleted, so any use is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
638
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data is not
    ///        implemented. The function is deleted, so any use is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
650
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data is not
    ///        implemented. The function is deleted, so any use is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
662
663 // helper function to unroll loops
664 template<int Begin, int End, int Step = 1>
665 struct UnrollerL {
666 template<typename Lambda>
667 INLINE static void step(Lambda& func) {
668 func(Begin);
669 UnrollerL<Begin + Step, End, Step>::step(func);
670 }
671 };
672
673 template<int End, int Step>
674 struct UnrollerL<End, End, Step> {
675 template<typename Lambda>
676 static void step(Lambda& func) {
677 }
678 };
679
680 // general CRC compute
681 INLINE
682 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
683 {
684 #if defined(_WIN64) || defined(__x86_64__)
685 uint32_t sizeInQwords = size / sizeof(uint64_t);
686 uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
687 uint64_t* pDataWords = (uint64_t*)pData;
688 for (uint32_t i = 0; i < sizeInQwords; ++i)
689 {
690 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
691 }
692 #else
693 uint32_t sizeInDwords = size / sizeof(uint32_t);
694 uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
695 uint32_t* pDataWords = (uint32_t*)pData;
696 for (uint32_t i = 0; i < sizeInDwords; ++i)
697 {
698 crc = _mm_crc32_u32(crc, *pDataWords++);
699 }
700 #endif
701
702 BYTE* pRemainderBytes = (BYTE*)pDataWords;
703 for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
704 {
705 crc = _mm_crc32_u8(crc, *pRemainderBytes++);
706 }
707
708 return crc;
709 }
710
711 //////////////////////////////////////////////////////////////////////////
712 /// Add byte offset to any-type pointer
713 //////////////////////////////////////////////////////////////////////////
714 template <typename T>
715 INLINE
716 static T* PtrAdd(T* p, intptr_t offset)
717 {
718 intptr_t intp = reinterpret_cast<intptr_t>(p);
719 return reinterpret_cast<T*>(intp + offset);
720 }
721
722 //////////////////////////////////////////////////////////////////////////
723 /// Is a power-of-2?
724 //////////////////////////////////////////////////////////////////////////
725 template <typename T>
726 INLINE
727 static bool IsPow2(T value)
728 {
729 return value == (value & (0 - value));
730 }
731
732 //////////////////////////////////////////////////////////////////////////
733 /// Align down to specified alignment
734 /// Note: IsPow2(alignment) MUST be true
735 //////////////////////////////////////////////////////////////////////////
736 template <typename T1, typename T2>
737 INLINE
738 static T1 AlignDownPow2(T1 value, T2 alignment)
739 {
740 SWR_ASSERT(IsPow2(alignment));
741 return value & ~T1(alignment - 1);
742 }
743
744 //////////////////////////////////////////////////////////////////////////
745 /// Align up to specified alignment
746 /// Note: IsPow2(alignment) MUST be true
747 //////////////////////////////////////////////////////////////////////////
748 template <typename T1, typename T2>
749 INLINE
750 static T1 AlignUpPow2(T1 value, T2 alignment)
751 {
752 return AlignDownPow2(value + T1(alignment - 1), alignment);
753 }
754
755 //////////////////////////////////////////////////////////////////////////
756 /// Align up ptr to specified alignment
757 /// Note: IsPow2(alignment) MUST be true
758 //////////////////////////////////////////////////////////////////////////
759 template <typename T1, typename T2>
760 INLINE
761 static T1* AlignUpPow2(T1* value, T2 alignment)
762 {
763 return reinterpret_cast<T1*>(
764 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
765 }
766
767 //////////////////////////////////////////////////////////////////////////
768 /// Align down to specified alignment
769 //////////////////////////////////////////////////////////////////////////
770 template <typename T1, typename T2>
771 INLINE
772 static T1 AlignDown(T1 value, T2 alignment)
773 {
774 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
775 return value - T1(value % alignment);
776 }
777
778 //////////////////////////////////////////////////////////////////////////
779 /// Align down to specified alignment
780 //////////////////////////////////////////////////////////////////////////
781 template <typename T1, typename T2>
782 INLINE
783 static T1* AlignDown(T1* value, T2 alignment)
784 {
785 return (T1*)AlignDown(uintptr_t(value), alignment);
786 }
787
788 //////////////////////////////////////////////////////////////////////////
789 /// Align up to specified alignment
790 /// Note: IsPow2(alignment) MUST be true
791 //////////////////////////////////////////////////////////////////////////
792 template <typename T1, typename T2>
793 INLINE
794 static T1 AlignUp(T1 value, T2 alignment)
795 {
796 return AlignDown(value + T1(alignment - 1), alignment);
797 }
798
799 //////////////////////////////////////////////////////////////////////////
800 /// Align up to specified alignment
801 /// Note: IsPow2(alignment) MUST be true
802 //////////////////////////////////////////////////////////////////////////
803 template <typename T1, typename T2>
804 INLINE
805 static T1* AlignUp(T1* value, T2 alignment)
806 {
807 return AlignDown(PtrAdd(value, alignment - 1), alignment);
808 }
809
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
///
/// ArrayLenT elements of BitsPerElementT bits each are packed into an
/// array of size_t words, lowest-indexed element in the least significant
/// bits of each word. Storage is zero-initialized and this class only
/// exposes read access.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    // Built by shifting an all-ones word right instead of shifting 1 left:
    // the left shift would be by the full word width (undefined) when an
    // element occupies an entire word, which the check below permits.
    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of the word (size_t) size");

    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Reads one packed element.
    /// @param elementIndex - index of the element; the caller must keep it
    ///                       below ArrayLenT (not checked here)
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};