1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Utilities used by SWR core.
27 ******************************************************************************/
31 #include "common/os.h"
32 #include "common/simdintrin.h"
33 #include "common/swr_assert.h"
/// Declaration of a routine that, by its name, saves an image to a PNG file
/// identified by a wide-character filename.
/// NOTE(review): the remaining parameters are not visible in this listing —
/// confirm the full signature against the complete header.
36 void SaveImageToPNGFile(
37 const WCHAR
*pFilename
,
/// Declaration of a routine that, by its name, opens a bitmap from the file
/// identified by a wide-character filename.
/// NOTE(review): the remaining parameters are not visible in this listing —
/// confirm the full signature against the complete header.
42 void OpenBitmapFromFile(
43 const WCHAR
*pFilename
,
// On 64-bit targets (_WIN64 or __x86_64__) the 64-bit lane helpers are plain
// aliases of the _mm_insert_epi64 / _mm_extract_epi64 intrinsics.
49 #if defined(_WIN64) || defined(__x86_64__)
50 #define _MM_INSERT_EPI64 _mm_insert_epi64
51 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
/// Software fallback for extracting one 64-bit lane from a 128-bit register.
/// The register is spilled to an aligned array of four 32-bit words and a
/// 64-bit value is rebuilt from a pair of them (words 0/1 or words 2/3).
/// @param a   - 128-bit source register
/// @param ndx - lane selector
/// NOTE(review): the branch on ndx and the return statements are not visible
/// in this listing; presumably ndx == 0 selects words 0/1 — confirm.
53 INLINE INT64
_MM_EXTRACT_EPI64(__m128i a
, const int32_t ndx
)
55 OSALIGNLINE(uint32_t) elems
[4];
// Spill the register so individual 32-bit words can be addressed.
56 _mm_store_si128((__m128i
*)elems
, a
);
// Pair built from words 0 (low half) and 1 (high half).
59 uint64_t foo
= elems
[0];
60 foo
|= (uint64_t)elems
[1] << 32;
// Pair built from words 2 (low half) and 3 (high half).
65 uint64_t foo
= elems
[2];
66 foo
|= (uint64_t)elems
[3] << 32;
/// Software fallback for inserting a 64-bit value into one lane of a 128-bit
/// register. The register is spilled to an aligned array of two 64-bit
/// elements and then reloaded into the result.
/// @param a   - 128-bit source register
/// @param b   - 64-bit value to insert
/// @param ndx - lane selector
/// NOTE(review): the statements that write b into the array, declare `out`
/// and return it are not visible in this listing — confirm against the full
/// source.
71 INLINE __m128i
_MM_INSERT_EPI64(__m128i a
, INT64 b
, const int32_t ndx
)
73 OSALIGNLINE(int64_t) elems
[2];
// Spill the register so the 64-bit lanes can be addressed individually.
74 _mm_store_si128((__m128i
*)elems
, a
);
// Reload the (presumably modified) lanes into the result register.
84 out
= _mm_load_si128((const __m128i
*)elems
);
/// Bounding box described by four integer edges: top, bottom, left, right.
/// NOTE(review): the member declarations and any default constructor are not
/// visible in this listing.
89 OSALIGNLINE(struct) BBOX
/// Constructs a box from explicit edge values.
/// @param t - top edge
/// @param b - bottom edge
/// @param l - left edge
/// @param r - right edge
97 BBOX(int t
, int b
, int l
, int r
) : top(t
), bottom(b
), left(l
), right(r
) {}
/// Equality: true only when all four edges match.
/// NOTE(review): not const-qualified, so it cannot be called on a const BBOX.
99 bool operator==(const BBOX
& rhs
)
101 return (this->top
== rhs
.top
&&
102 this->bottom
== rhs
.bottom
&&
103 this->left
== rhs
.left
&&
104 this->right
== rhs
.right
);
/// Inequality: the negation of operator==. Also not const-qualified.
107 bool operator!=(const BBOX
& rhs
)
109 return !(*this == rhs
);
/// Rearranges four rows of four 32-bit float lanes in place; by its name this
/// is a 4x4 transpose. The rows are reinterpreted as integer registers,
/// interleaved with 32-bit and then 64-bit unpack operations, and cast back.
/// @param row0..row3 - rows to rearrange; overwritten with the result
/// NOTE(review): this listing appears to be missing lines between the unpack
/// groups (possibly intermediate register copies) — verify the exact sequence
/// against the complete source before relying on it.
122 void vTranspose(__m128
&row0
, __m128
&row1
, __m128
&row2
, __m128
&row3
)
// Bit-preserving reinterpretation so the integer unpack intrinsics can be used.
124 __m128i row0i
= _mm_castps_si128(row0
);
125 __m128i row1i
= _mm_castps_si128(row1
);
126 __m128i row2i
= _mm_castps_si128(row2
);
127 __m128i row3i
= _mm_castps_si128(row3
);
// Interleave 32-bit lanes of rows 2 and 3; vTemp receives the high halves.
129 __m128i vTemp
= row2i
;
130 row2i
= _mm_unpacklo_epi32(row2i
, row3i
);
131 vTemp
= _mm_unpackhi_epi32(vTemp
, row3i
);
// Interleave 32-bit lanes with row 1.
134 row0i
= _mm_unpacklo_epi32(row0i
, row1i
);
135 row3i
= _mm_unpackhi_epi32(row3i
, row1i
);
// Combine 64-bit halves to form the first two output rows.
138 row0i
= _mm_unpacklo_epi64(row0i
, row2i
);
139 row1i
= _mm_unpackhi_epi64(row1i
, row2i
);
// Combine 64-bit halves with vTemp to form the last two output rows.
142 row2i
= _mm_unpacklo_epi64(row2i
, vTemp
);
143 row3i
= _mm_unpackhi_epi64(row3i
, vTemp
);
// Reinterpret back to float registers and write through the references.
145 row0
= _mm_castsi128_ps(row0i
);
146 row1
= _mm_castsi128_ps(row1i
);
147 row2
= _mm_castsi128_ps(row2i
);
148 row3
= _mm_castsi128_ps(row3i
);
/// Integer-register overload: rearranges four rows of four 32-bit lanes in
/// place using 32-bit and then 64-bit unpack operations; by its name this is
/// a 4x4 transpose.
/// @param row0..row3 - rows to rearrange; overwritten with the result
/// NOTE(review): this listing appears to be missing lines between the unpack
/// groups (possibly intermediate register copies) — verify the exact sequence
/// against the complete source before relying on it.
152 void vTranspose(__m128i
&row0
, __m128i
&row1
, __m128i
&row2
, __m128i
&row3
)
// Interleave 32-bit lanes of rows 2 and 3; vTemp receives the high halves.
154 __m128i vTemp
= row2
;
155 row2
= _mm_unpacklo_epi32(row2
, row3
);
156 vTemp
= _mm_unpackhi_epi32(vTemp
, row3
);
// Interleave 32-bit lanes with row 1.
159 row0
= _mm_unpacklo_epi32(row0
, row1
);
160 row3
= _mm_unpackhi_epi32(row3
, row1
);
// Combine 64-bit halves to form the first two output rows.
163 row0
= _mm_unpacklo_epi64(row0
, row2
);
164 row1
= _mm_unpackhi_epi64(row1
, row2
);
// Combine 64-bit halves with vTemp to form the last two output rows.
167 row2
= _mm_unpacklo_epi64(row2
, vTemp
);
168 row3
= _mm_unpackhi_epi64(row3
, vTemp
);
// Folds the GCC version into one comparable integer:
// major * 10000 + minor * 100 + patchlevel (e.g. 4.9.0 -> 40900).
171 #define GCC_VERSION (__GNUC__ * 10000 \
172 + __GNUC_MINOR__ * 100 \
173 + __GNUC_PATCHLEVEL__)
// For GCC older than 4.9.0 the *_undefined_* intrinsics are aliased to their
// zero-initialising counterparts (presumably because those compilers do not
// provide them — confirm). The 256-bit alias is only added for 8-wide SIMD.
175 #if defined(__GNUC__) && (GCC_VERSION < 40900)
176 #define _mm_undefined_ps _mm_setzero_ps
177 #define _mm_undefined_si128 _mm_setzero_si128
178 #if KNOB_SIMD_WIDTH == 8
179 #define _mm256_undefined_ps _mm256_setzero_ps
183 #if KNOB_SIMD_WIDTH == 8
/// Converts three 8-wide component registers (x, y, z) into eight 4-float
/// vectors, one per SIMD lane, laid out as x y z w.
/// The w slot is fed from _mm256_undefined_ps(), so its contents are
/// unspecified (or zero where that intrinsic is aliased to a setzero call).
/// @param vDst  - receives the eight per-lane vectors
/// @param vSrc0 - x components of lanes 0..7
/// @param vSrc1 - y components of lanes 0..7
/// @param vSrc2 - z components of lanes 0..7
185 void vTranspose3x8(__m128 (&vDst
)[8], __m256
&vSrc0
, __m256
&vSrc1
, __m256
&vSrc2
)
// Low pairs of each 128-bit half: interleave x with z and y with the filler.
187 __m256 r0r2
= _mm256_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
188 __m256 r1rx
= _mm256_unpacklo_ps(vSrc1
, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
189 __m256 r02r1xlolo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
190 __m256 r02r1xlohi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
// High pairs of each 128-bit half, same interleave.
192 r0r2
= _mm256_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
193 r1rx
= _mm256_unpackhi_ps(vSrc1
, _mm256_undefined_ps()); //y2w2y3w3 y6w6y7w7
194 __m256 r02r1xhilo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
195 __m256 r02r1xhihi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
// Lower 128-bit halves hold lanes 0..3.
197 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
198 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
199 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
200 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
// Upper 128-bit halves hold lanes 4..7.
202 vDst
[4] = _mm256_extractf128_ps(r02r1xlolo
, 1);
203 vDst
[5] = _mm256_extractf128_ps(r02r1xlohi
, 1);
204 vDst
[6] = _mm256_extractf128_ps(r02r1xhilo
, 1);
205 vDst
[7] = _mm256_extractf128_ps(r02r1xhihi
, 1);
/// Converts four 8-wide component registers (x, y, z, w) into eight 4-float
/// vectors, one per SIMD lane, laid out as x y z w.
/// @param vDst  - receives the eight per-lane vectors
/// @param vSrc0 - x components of lanes 0..7
/// @param vSrc1 - y components of lanes 0..7
/// @param vSrc2 - z components of lanes 0..7
/// @param vSrc3 - w components of lanes 0..7
209 void vTranspose4x8(__m128 (&vDst
)[8], __m256
&vSrc0
, __m256
&vSrc1
, __m256
&vSrc2
, __m256
&vSrc3
)
// Low pairs of each 128-bit half: interleave x with z and y with w.
211 __m256 r0r2
= _mm256_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
212 __m256 r1rx
= _mm256_unpacklo_ps(vSrc1
, vSrc3
); //y0w0y1w1 y4w4y5w5
213 __m256 r02r1xlolo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
214 __m256 r02r1xlohi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
// High pairs of each 128-bit half, same interleave.
216 r0r2
= _mm256_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
217 r1rx
= _mm256_unpackhi_ps(vSrc1
, vSrc3
) ; //y2w2y3w3 y6w6y7w7
218 __m256 r02r1xhilo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
219 __m256 r02r1xhihi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
// Lower 128-bit halves hold lanes 0..3.
221 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
222 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
223 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
224 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
// Upper 128-bit halves hold lanes 4..7.
226 vDst
[4] = _mm256_extractf128_ps(r02r1xlolo
, 1);
227 vDst
[5] = _mm256_extractf128_ps(r02r1xlohi
, 1);
228 vDst
[6] = _mm256_extractf128_ps(r02r1xhilo
, 1);
229 vDst
[7] = _mm256_extractf128_ps(r02r1xhihi
, 1);
/// Transposes an 8x8 matrix of 32-bit floats held in eight 256-bit registers.
/// Three stages: (1) interleave adjacent row pairs within each 128-bit half,
/// (2) shuffle to gather four rows' worth of a column per half, (3) combine
/// matching 128-bit halves across registers.
/// @param vDst           - receives the eight transposed rows
/// @param vMask0..vMask7 - the eight input rows
233 void vTranspose8x8(__m256 (&vDst
)[8], const __m256
&vMask0
, const __m256
&vMask1
, const __m256
&vMask2
, const __m256
&vMask3
, const __m256
&vMask4
, const __m256
&vMask5
, const __m256
&vMask6
, const __m256
&vMask7
)
// Stage 1: interleave 32-bit lanes of adjacent row pairs.
235 __m256 __t0
= _mm256_unpacklo_ps(vMask0
, vMask1
);
236 __m256 __t1
= _mm256_unpackhi_ps(vMask0
, vMask1
);
237 __m256 __t2
= _mm256_unpacklo_ps(vMask2
, vMask3
);
238 __m256 __t3
= _mm256_unpackhi_ps(vMask2
, vMask3
);
239 __m256 __t4
= _mm256_unpacklo_ps(vMask4
, vMask5
);
240 __m256 __t5
= _mm256_unpackhi_ps(vMask4
, vMask5
);
241 __m256 __t6
= _mm256_unpacklo_ps(vMask6
, vMask7
);
242 __m256 __t7
= _mm256_unpackhi_ps(vMask6
, vMask7
);
// Stage 2: (1,0,1,0) takes the low lane pair of each operand,
// (3,2,3,2) takes the high lane pair.
243 __m256 __tt0
= _mm256_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(1,0,1,0));
244 __m256 __tt1
= _mm256_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(3,2,3,2));
245 __m256 __tt2
= _mm256_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(1,0,1,0));
246 __m256 __tt3
= _mm256_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(3,2,3,2));
247 __m256 __tt4
= _mm256_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(1,0,1,0));
248 __m256 __tt5
= _mm256_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(3,2,3,2));
249 __m256 __tt6
= _mm256_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(1,0,1,0));
250 __m256 __tt7
= _mm256_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(3,2,3,2));
// Stage 3: 0x20 joins the low 128-bit halves of both operands (outputs 0..3).
251 vDst
[0] = _mm256_permute2f128_ps(__tt0
, __tt4
, 0x20);
252 vDst
[1] = _mm256_permute2f128_ps(__tt1
, __tt5
, 0x20);
253 vDst
[2] = _mm256_permute2f128_ps(__tt2
, __tt6
, 0x20);
254 vDst
[3] = _mm256_permute2f128_ps(__tt3
, __tt7
, 0x20);
// 0x31 joins the high 128-bit halves of both operands (outputs 4..7).
255 vDst
[4] = _mm256_permute2f128_ps(__tt0
, __tt4
, 0x31);
256 vDst
[5] = _mm256_permute2f128_ps(__tt1
, __tt5
, 0x31);
257 vDst
[6] = _mm256_permute2f128_ps(__tt2
, __tt6
, 0x31);
258 vDst
[7] = _mm256_permute2f128_ps(__tt3
, __tt7
, 0x31);
/// Integer-register overload of the 8x8 transpose. Each input is bit-cast to
/// a float register (no value conversion) and forwarded to the __m256
/// overload; the output remains an array of __m256.
/// @param vDst           - receives the eight transposed rows
/// @param vMask0..vMask7 - the eight input rows as integer registers
262 void vTranspose8x8(__m256 (&vDst
)[8], const __m256i
&vMask0
, const __m256i
&vMask1
, const __m256i
&vMask2
, const __m256i
&vMask3
, const __m256i
&vMask4
, const __m256i
&vMask5
, const __m256i
&vMask6
, const __m256i
&vMask7
)
264 vTranspose8x8(vDst
, _mm256_castsi256_ps(vMask0
), _mm256_castsi256_ps(vMask1
), _mm256_castsi256_ps(vMask2
), _mm256_castsi256_ps(vMask3
),
265 _mm256_castsi256_ps(vMask4
), _mm256_castsi256_ps(vMask5
), _mm256_castsi256_ps(vMask6
), _mm256_castsi256_ps(vMask7
));
269 //////////////////////////////////////////////////////////////////////////
270 /// TransposeSingleComponent
271 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for formats with a single component of `bpp` bits.
/// With only one component the SOA and AOS layouts coincide, so the data is
/// copied unchanged.
272 template<uint32_t bpp
>
273 struct TransposeSingleComponent
275 //////////////////////////////////////////////////////////////////////////
276 /// @brief Pass-thru for single component: copies
///        (bpp * KNOB_SIMD_WIDTH) / 8 bytes from pSrc to pDst.
277 /// @param pSrc - source data in SOA form
278 /// @param pDst - output data in AOS form
279 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
// bpp bits per lane times KNOB_SIMD_WIDTH lanes, converted to bytes.
281 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
285 //////////////////////////////////////////////////////////////////////////
287 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for four 8-bit components (r, g, b, a).
/// Only an 8-wide SIMD configuration is implemented; other widths hit the
/// #error. Separate code paths exist for KNOB_ARCH_AVX and KNOB_ARCH_AVX2.
288 struct Transpose8_8_8_8
290 //////////////////////////////////////////////////////////////////////////
291 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
///        Input is 32 bytes (8 r, 8 g, 8 b, 8 a); output is 32 bytes of
///        interleaved rgba.
292 /// @param pSrc - source data in SOA form
293 /// @param pDst - output data in AOS form
294 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
296 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
297 #if KNOB_SIMD_WIDTH == 8
298 #if KNOB_ARCH == KNOB_ARCH_AVX
// AVX path: split into 128-bit halves and interleave with SSE unpacks,
// widening the interleave from 64-bit to 8-bit to 16-bit granularity.
299 __m128i c0c1
= _mm256_castsi256_si128(src
); // rrrrrrrrgggggggg
300 __m128i c2c3
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src
), 1)); // bbbbbbbbaaaaaaaa
301 __m128i c0c2
= _mm_unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
302 __m128i c1c3
= _mm_unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
303 __m128i c01
= _mm_unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
304 __m128i c23
= _mm_unpackhi_epi8(c0c2
, c1c3
); // babababababababa
305 __m128i c0123lo
= _mm_unpacklo_epi16(c01
, c23
); // rgbargbargbargba
306 __m128i c0123hi
= _mm_unpackhi_epi16(c01
, c23
); // rgbargbargbargba
// Aligned 16-byte stores: pixels 0..3 first, then pixels 4..7.
307 _mm_store_si128((__m128i
*)pDst
, c0123lo
);
308 _mm_store_si128((__m128i
*)(pDst
+ 16), c0123hi
);
309 #elif KNOB_ARCH == KNOB_ARCH_AVX2
// AVX2 path: byte-shuffle src and a copy of src with its 128-bit halves
// swapped (0x80 mask bytes produce zero), then OR the two partial results.
310 simdscalari dst01
= _mm256_shuffle_epi8(src
,
311 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
312 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
313 dst23
= _mm256_shuffle_epi8(dst23
,
314 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
315 simdscalari dst
= _mm256_or_si256(dst01
, dst23
);
316 _simd_store_si((simdscalari
*)pDst
, dst
);
319 #error Unsupported vector width
324 //////////////////////////////////////////////////////////////////////////
326 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for three 8-bit components. No implementation is
/// provided: Transpose is declared deleted, so any use fails to compile.
327 struct Transpose8_8_8
329 //////////////////////////////////////////////////////////////////////////
330 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
///        Deleted — not implemented.
331 /// @param pSrc - source data in SOA form
332 /// @param pDst - output data in AOS form
333 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
336 //////////////////////////////////////////////////////////////////////////
338 //////////////////////////////////////////////////////////////////////////
341 //////////////////////////////////////////////////////////////////////////
342 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
///        Interleaves 8 r bytes with 8 g bytes and writes 16 bytes of
///        rgrg... to pDst. Only 8-wide SIMD is implemented.
/// NOTE(review): the enclosing struct header is not visible in this listing
/// (presumably Transpose8_8) — confirm.
343 /// @param pSrc - source data in SOA form
344 /// @param pDst - output data in AOS form
345 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
347 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
349 #if KNOB_SIMD_WIDTH == 8
// Only the low 128 bits of the load are used.
350 __m128i rg
= _mm256_castsi256_si128(src
); // rrrrrrrr gggggggg
// Duplicate the g bytes into the low half so they can pair with r.
351 __m128i g
= _mm_unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
352 rg
= _mm_unpacklo_epi8(rg
, g
);
// Aligned 16-byte store.
353 _mm_store_si128((__m128i
*)pDst
, rg
);
355 #error Unsupported vector width
360 //////////////////////////////////////////////////////////////////////////
361 /// Transpose32_32_32_32
362 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for four 32-bit components. Only 8-wide SIMD is
/// implemented; other widths hit the #error.
363 struct Transpose32_32_32_32
365 //////////////////////////////////////////////////////////////////////////
366 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
///        Reads four 8-float component vectors and writes eight 4-float
///        pixels (32 floats) to pDst.
367 /// @param pSrc - source data in SOA form
368 /// @param pDst - output data in AOS form
369 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
371 #if KNOB_SIMD_WIDTH == 8
// One 8-float vector per component, at float offsets 0, 8, 16 and 24.
372 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
373 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
374 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
375 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
// NOTE(review): the declaration of vDst is not visible in this listing.
378 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
// Aligned stores, one 4-float pixel each.
379 _mm_store_ps((float*)pDst
, vDst
[0]);
380 _mm_store_ps((float*)pDst
+4, vDst
[1]);
381 _mm_store_ps((float*)pDst
+8, vDst
[2]);
382 _mm_store_ps((float*)pDst
+12, vDst
[3]);
383 _mm_store_ps((float*)pDst
+16, vDst
[4]);
384 _mm_store_ps((float*)pDst
+20, vDst
[5]);
385 _mm_store_ps((float*)pDst
+24, vDst
[6]);
386 _mm_store_ps((float*)pDst
+28, vDst
[7]);
388 #error Unsupported vector width
393 //////////////////////////////////////////////////////////////////////////
394 /// Transpose32_32_32
395 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for three 32-bit components. Only 8-wide SIMD is
/// implemented; other widths hit the #error.
396 struct Transpose32_32_32
398 //////////////////////////////////////////////////////////////////////////
399 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
///        Reads three 8-float component vectors and still writes eight
///        full 4-float vectors (32 floats) to pDst, so the destination
///        must have room for a fourth slot per pixel.
400 /// @param pSrc - source data in SOA form
401 /// @param pDst - output data in AOS form
402 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
404 #if KNOB_SIMD_WIDTH == 8
// One 8-float vector per component, at float offsets 0, 8 and 16.
405 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
406 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
407 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
// NOTE(review): the declaration of vDst is not visible in this listing.
410 vTranspose3x8(vDst
, src0
, src1
, src2
);
// Aligned stores, one 4-float vector each.
411 _mm_store_ps((float*)pDst
, vDst
[0]);
412 _mm_store_ps((float*)pDst
+ 4, vDst
[1]);
413 _mm_store_ps((float*)pDst
+ 8, vDst
[2]);
414 _mm_store_ps((float*)pDst
+ 12, vDst
[3]);
415 _mm_store_ps((float*)pDst
+ 16, vDst
[4]);
416 _mm_store_ps((float*)pDst
+ 20, vDst
[5]);
417 _mm_store_ps((float*)pDst
+ 24, vDst
[6]);
418 _mm_store_ps((float*)pDst
+ 28, vDst
[7]);
420 #error Unsupported vector width
425 //////////////////////////////////////////////////////////////////////////
427 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for two 32-bit components (r, g).
428 struct Transpose32_32
430 //////////////////////////////////////////////////////////////////////////
431 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
///        Reads 8 r floats followed by 8 g floats and writes 16 floats
///        interleaved as r g r g ... Uses aligned 128-bit loads and stores.
432 /// @param pSrc - source data in SOA form
433 /// @param pDst - output data in AOS form
434 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
436 const float* pfSrc
= (const float*)pSrc
;
// r occupies floats 0..7 and g floats 8..15, each loaded as two 4-float halves.
437 __m128 src_r0
= _mm_load_ps(pfSrc
+ 0);
438 __m128 src_r1
= _mm_load_ps(pfSrc
+ 4);
439 __m128 src_g0
= _mm_load_ps(pfSrc
+ 8);
440 __m128 src_g1
= _mm_load_ps(pfSrc
+ 12);
// Interleave r with g: each result holds two consecutive (r, g) pairs.
442 __m128 dst0
= _mm_unpacklo_ps(src_r0
, src_g0
);
443 __m128 dst1
= _mm_unpackhi_ps(src_r0
, src_g0
);
444 __m128 dst2
= _mm_unpacklo_ps(src_r1
, src_g1
);
445 __m128 dst3
= _mm_unpackhi_ps(src_r1
, src_g1
);
447 float* pfDst
= (float*)pDst
;
448 _mm_store_ps(pfDst
+ 0, dst0
);
449 _mm_store_ps(pfDst
+ 4, dst1
);
450 _mm_store_ps(pfDst
+ 8, dst2
);
451 _mm_store_ps(pfDst
+ 12, dst3
);
455 //////////////////////////////////////////////////////////////////////////
456 /// Transpose16_16_16_16
457 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for four 16-bit components (r, g, b, a). Only 8-wide
/// SIMD is implemented; other widths hit the #error.
458 struct Transpose16_16_16_16
460 //////////////////////////////////////////////////////////////////////////
461 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
///        Reads two 256-bit vectors (r|g and b|a, eight 16-bit values per
///        component) and writes four 128-bit vectors of interleaved rgba.
462 /// @param pSrc - source data in SOA form
463 /// @param pDst - output data in AOS form
464 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
466 #if KNOB_SIMD_WIDTH == 8
467 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
468 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
// Split each 256-bit load into its two 128-bit component vectors.
470 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
471 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
472 __m128i src_b
= _mm256_extractf128_si256(src_ba
, 0);
473 __m128i src_a
= _mm256_extractf128_si256(src_ba
, 1);
// Pair up 16-bit values: r with g, b with a.
475 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
476 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
477 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
478 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
// Pair up the 32-bit (r,g) and (b,a) groups into full rgba pixels.
480 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
481 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
482 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
483 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
// Four aligned 16-byte stores, two pixels each.
485 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
486 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
487 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
488 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
490 #error Unsupported vector width
495 //////////////////////////////////////////////////////////////////////////
496 /// Transpose16_16_16
497 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for three 16-bit components (r, g, b). Only 8-wide SIMD
/// is implemented; other widths hit the #error.
498 struct Transpose16_16_16
500 //////////////////////////////////////////////////////////////////////////
501 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
///        Reads r|g as one 256-bit vector plus b as a 128-bit vector and
///        writes four 128-bit vectors in a four-slot-per-pixel layout; the
///        fourth slot comes from _mm_undefined_si128(), so its contents
///        are unspecified.
502 /// @param pSrc - source data in SOA form
503 /// @param pDst - output data in AOS form
504 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
506 #if KNOB_SIMD_WIDTH == 8
507 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
509 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
510 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
// b follows r|g in memory; only 128 bits are read for it.
511 __m128i src_b
= _mm_load_si128((const __m128i
*)(pSrc
+ sizeof(simdscalari
)));
// No alpha input exists; a placeholder register fills the fourth slot.
512 __m128i src_a
= _mm_undefined_si128();
// Pair up 16-bit values: r with g, b with the placeholder.
514 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
515 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
516 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
517 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
// Pair up the 32-bit groups into four-slot pixels.
519 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
520 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
521 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
522 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
// Four aligned 16-byte stores (64 bytes total).
524 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
525 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
526 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
527 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
529 #error Unsupported vector width
534 //////////////////////////////////////////////////////////////////////////
536 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for two 16-bit components. Only 8-wide SIMD is
/// implemented; other widths hit the #error.
537 struct Transpose16_16
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
///        Reads one 256-bit vector (component 0 in the low half, component
///        1 in the high half) and writes two 128-bit vectors of
///        interleaved 16-bit pairs.
541 /// @param pSrc - source data in SOA form
542 /// @param pDst - output data in AOS form
543 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
// Loaded as floats; the bits are reinterpreted as integers below.
545 simdscalar src
= _simd_load_ps((const float*)pSrc
);
547 #if KNOB_SIMD_WIDTH == 8
548 __m128 comp0
= _mm256_castps256_ps128(src
);
549 __m128 comp1
= _mm256_extractf128_ps(src
, 1);
551 __m128i comp0i
= _mm_castps_si128(comp0
);
552 __m128i comp1i
= _mm_castps_si128(comp1
);
// Interleave 16-bit values of the two components.
554 __m128i resLo
= _mm_unpacklo_epi16(comp0i
, comp1i
);
555 __m128i resHi
= _mm_unpackhi_epi16(comp0i
, comp1i
);
// Two aligned 16-byte stores.
557 _mm_store_si128((__m128i
*)pDst
, resLo
);
558 _mm_store_si128((__m128i
*)pDst
+ 1, resHi
);
560 #error Unsupported vector width
565 //////////////////////////////////////////////////////////////////////////
567 //////////////////////////////////////////////////////////////////////////
570 //////////////////////////////////////////////////////////////////////////
571 /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
///        Deleted — not implemented; any use fails to compile.
/// NOTE(review): the enclosing struct header is not visible in this listing
/// (presumably Transpose24_8) — confirm.
572 /// @param pSrc - source data in SOA form
573 /// @param pDst - output data in AOS form
574 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
577 //////////////////////////////////////////////////////////////////////////
579 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for the 32_8_24 format. No implementation is provided:
/// Transpose is declared deleted, so any use fails to compile.
580 struct Transpose32_8_24
582 //////////////////////////////////////////////////////////////////////////
583 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
///        Deleted — not implemented.
584 /// @param pSrc - source data in SOA form
585 /// @param pDst - output data in AOS form
586 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
591 //////////////////////////////////////////////////////////////////////////
593 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for the 4_4_4_4 format. No implementation is provided:
/// Transpose is declared deleted, so any use fails to compile.
594 struct Transpose4_4_4_4
596 //////////////////////////////////////////////////////////////////////////
597 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
///        Deleted — not implemented.
598 /// @param pSrc - source data in SOA form
599 /// @param pDst - output data in AOS form
600 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
603 //////////////////////////////////////////////////////////////////////////
605 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for the 5_6_5 format. No implementation is provided:
/// Transpose is declared deleted, so any use fails to compile.
606 struct Transpose5_6_5
608 //////////////////////////////////////////////////////////////////////////
609 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
///        Deleted — not implemented.
610 /// @param pSrc - source data in SOA form
611 /// @param pDst - output data in AOS form
612 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
615 //////////////////////////////////////////////////////////////////////////
617 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for the 9_9_9_5 format. No implementation is provided:
/// Transpose is declared deleted, so any use fails to compile.
618 struct Transpose9_9_9_5
620 //////////////////////////////////////////////////////////////////////////
621 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
///        Deleted — not implemented.
622 /// @param pSrc - source data in SOA form
623 /// @param pDst - output data in AOS form
624 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
627 //////////////////////////////////////////////////////////////////////////
629 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for the 5_5_5_1 format. No implementation is provided:
/// Transpose is declared deleted, so any use fails to compile.
630 struct Transpose5_5_5_1
632 //////////////////////////////////////////////////////////////////////////
633 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
///        Deleted — not implemented.
634 /// @param pSrc - source data in SOA form
635 /// @param pDst - output data in AOS form
636 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
639 //////////////////////////////////////////////////////////////////////////
640 /// Transpose10_10_10_2
641 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for the 10_10_10_2 format. No implementation is
/// provided: Transpose is declared deleted, so any use fails to compile.
642 struct Transpose10_10_10_2
644 //////////////////////////////////////////////////////////////////////////
645 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
///        Deleted — not implemented.
646 /// @param pSrc - source data in SOA form
647 /// @param pDst - output data in AOS form
648 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
651 //////////////////////////////////////////////////////////////////////////
652 /// Transpose11_11_10
653 //////////////////////////////////////////////////////////////////////////
/// Transpose policy for the 11_11_10 format. No implementation is provided:
/// Transpose is declared deleted, so any use fails to compile.
654 struct Transpose11_11_10
656 //////////////////////////////////////////////////////////////////////////
657 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
///        Deleted — not implemented.
658 /// @param pSrc - source data in SOA form
659 /// @param pDst - output data in AOS form
660 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
663 // helper function to unroll loops
/// Compile-time loop helper: step() recurses into the instantiation for
/// Begin + Step until the <End, End, Step> specialization is reached.
/// Step defaults to 1. End must be reachable from Begin in whole Step
/// increments, otherwise the recursion never hits the terminating case.
/// NOTE(review): the struct header and the statement that invokes func for
/// the current index are not visible in this listing — confirm.
664 template<int Begin
, int End
, int Step
= 1>
/// @param func - callable applied at each step; taken by non-const
///               reference, so a temporary cannot be passed directly.
666 template<typename Lambda
>
667 INLINE
static void step(Lambda
& func
) {
// Continue with the next index at compile time.
669 UnrollerL
<Begin
+ Step
, End
, Step
>::step(func
);
/// Terminating specialization of UnrollerL, selected when Begin == End.
/// Its step() ends the compile-time recursion; no body statements are
/// visible in this listing.
673 template<int End
, int Step
>
674 struct UnrollerL
<End
, End
, Step
> {
675 template<typename Lambda
>
676 static void step(Lambda
& func
) {
680 // general CRC compute
/// Accumulates a CRC over a byte buffer using the hardware CRC32
/// intrinsics (_mm_crc32_u64 / _mm_crc32_u32 / _mm_crc32_u8).
/// On 64-bit targets the buffer is consumed 8 bytes at a time, otherwise
/// 4 bytes at a time; leftover bytes are consumed one by one.
/// @param crc   - starting CRC value (allows chaining across buffers)
/// @param pData - buffer to checksum
/// @param size  - buffer length in bytes
/// NOTE(review): the pointer casts drop the const qualifier of pData and
/// read through wider integer pointers regardless of alignment; the return
/// statement is not visible in this listing.
682 uint32_t ComputeCRC(uint32_t crc
, const void *pData
, uint32_t size
)
684 #if defined(_WIN64) || defined(__x86_64__)
// 64-bit path: whole 8-byte words first.
685 uint32_t sizeInQwords
= size
/ sizeof(uint64_t);
686 uint32_t sizeRemainderBytes
= size
% sizeof(uint64_t);
687 uint64_t* pDataWords
= (uint64_t*)pData
;
688 for (uint32_t i
= 0; i
< sizeInQwords
; ++i
)
// The intrinsic returns 64 bits; the running CRC is kept in 32 bits.
690 crc
= (uint32_t)_mm_crc32_u64(crc
, *pDataWords
++);
// 32-bit path: whole 4-byte words first.
693 uint32_t sizeInDwords
= size
/ sizeof(uint32_t);
694 uint32_t sizeRemainderBytes
= size
% sizeof(uint32_t);
695 uint32_t* pDataWords
= (uint32_t*)pData
;
696 for (uint32_t i
= 0; i
< sizeInDwords
; ++i
)
698 crc
= _mm_crc32_u32(crc
, *pDataWords
++);
// Tail: pDataWords now points just past the last whole word.
702 BYTE
* pRemainderBytes
= (BYTE
*)pDataWords
;
703 for (uint32_t i
= 0; i
< sizeRemainderBytes
; ++i
)
705 crc
= _mm_crc32_u8(crc
, *pRemainderBytes
++);
711 //////////////////////////////////////////////////////////////////////////
712 /// Add byte offset to any-type pointer
713 //////////////////////////////////////////////////////////////////////////
/// Offsets a typed pointer by a number of BYTES (not elements) and returns
/// it with the same pointee type. The arithmetic is done on the pointer's
/// integer representation, so the offset may be negative.
/// @param p      - base pointer
/// @param offset - signed byte offset
/// @return p advanced by offset bytes
714 template <typename T
>
716 static T
* PtrAdd(T
* p
, intptr_t offset
)
718 intptr_t intp
= reinterpret_cast<intptr_t>(p
);
719 return reinterpret_cast<T
*>(intp
+ offset
);
722 //////////////////////////////////////////////////////////////////////////
724 //////////////////////////////////////////////////////////////////////////
/// Tests whether value has at most one bit set.
/// `value & (0 - value)` isolates the lowest set bit; the test passes when
/// that bit is the whole value. Note that 0 also passes (0 == (0 & 0)).
/// @param value - integer to test
/// @return true for powers of two and for zero
725 template <typename T
>
727 static bool IsPow2(T value
)
729 return value
== (value
& (0 - value
));
732 //////////////////////////////////////////////////////////////////////////
733 /// Align down to specified alignment
734 /// Note: IsPow2(alignment) MUST be true
735 //////////////////////////////////////////////////////////////////////////
/// Rounds value down to a multiple of alignment by clearing the low bits.
/// The power-of-two precondition is checked with SWR_ASSERT.
/// @param value     - value to align
/// @param alignment - power-of-two alignment
/// @return value with the low (alignment - 1) bits cleared
736 template <typename T1
, typename T2
>
738 static T1
AlignDownPow2(T1 value
, T2 alignment
)
740 SWR_ASSERT(IsPow2(alignment
));
// The mask is converted to T1 before inversion so it spans T1's full width.
741 return value
& ~T1(alignment
- 1);
744 //////////////////////////////////////////////////////////////////////////
745 /// Align up to specified alignment
746 /// Note: IsPow2(alignment) MUST be true
747 //////////////////////////////////////////////////////////////////////////
/// Rounds value up to a multiple of a power-of-two alignment: adds
/// (alignment - 1) and then aligns down via AlignDownPow2.
/// @param value     - value to align
/// @param alignment - power-of-two alignment
/// @return the aligned value
/// NOTE(review): the addition is not checked for overflow of T1.
748 template <typename T1
, typename T2
>
750 static T1
AlignUpPow2(T1 value
, T2 alignment
)
752 return AlignDownPow2(value
+ T1(alignment
- 1), alignment
);
755 //////////////////////////////////////////////////////////////////////////
756 /// Align up ptr to specified alignment
757 /// Note: IsPow2(alignment) MUST be true
758 //////////////////////////////////////////////////////////////////////////
/// Pointer overload: rounds the address up to a multiple of a power-of-two
/// alignment. The address is converted to uintptr_t, bumped by
/// (alignment - 1), aligned down via AlignDownPow2 and converted back.
/// @param value     - pointer to align
/// @param alignment - power-of-two alignment in bytes
/// @return the aligned pointer, same pointee type
759 template <typename T1
, typename T2
>
761 static T1
* AlignUpPow2(T1
* value
, T2 alignment
)
763 return reinterpret_cast<T1
*>(
764 AlignDownPow2(reinterpret_cast<uintptr_t>(value
) + uintptr_t(alignment
- 1), alignment
));
767 //////////////////////////////////////////////////////////////////////////
768 /// Align down to specified alignment
769 //////////////////////////////////////////////////////////////////////////
/// Rounds value down to a multiple of alignment for any alignment.
/// When IsPow2(alignment) holds, the mask-based AlignDownPow2 is used;
/// otherwise the remainder (value % alignment) is subtracted.
/// @param value     - value to align
/// @param alignment - alignment
/// @return the aligned value
770 template <typename T1
, typename T2
>
772 static T1
AlignDown(T1 value
, T2 alignment
)
774 if (IsPow2(alignment
)) { return AlignDownPow2(value
, alignment
); }
// General case: drop the remainder.
775 return value
- T1(value
% alignment
);
778 //////////////////////////////////////////////////////////////////////////
779 /// Align down to specified alignment
780 //////////////////////////////////////////////////////////////////////////
/// Pointer overload: rounds the address down to a multiple of alignment by
/// converting it to uintptr_t, calling the integer AlignDown, and casting
/// the result back to the pointer type.
/// @param value     - pointer to align
/// @param alignment - alignment in bytes
/// @return the aligned pointer, same pointee type
781 template <typename T1
, typename T2
>
783 static T1
* AlignDown(T1
* value
, T2 alignment
)
785 return (T1
*)AlignDown(uintptr_t(value
), alignment
);
788 //////////////////////////////////////////////////////////////////////////
789 /// Align up to specified alignment
790 /// Note: IsPow2(alignment) MUST be true
791 //////////////////////////////////////////////////////////////////////////
/// Rounds value up to a multiple of alignment: adds (alignment - 1) and
/// then aligns down via AlignDown, which covers both power-of-two and
/// other alignments.
/// @param value     - value to align
/// @param alignment - alignment
/// @return the aligned value
/// NOTE(review): the addition is not checked for overflow of T1.
792 template <typename T1
, typename T2
>
794 static T1
AlignUp(T1 value
, T2 alignment
)
796 return AlignDown(value
+ T1(alignment
- 1), alignment
);
799 //////////////////////////////////////////////////////////////////////////
800 /// Align up to specified alignment
801 /// Note: IsPow2(alignment) MUST be true
802 //////////////////////////////////////////////////////////////////////////
/// Pointer overload: rounds the address up to a multiple of alignment by
/// advancing the pointer (alignment - 1) bytes with PtrAdd and then
/// aligning down with the pointer overload of AlignDown.
/// @param value     - pointer to align
/// @param alignment - alignment in bytes
/// @return the aligned pointer, same pointee type
803 template <typename T1
, typename T2
>
805 static T1
* AlignUp(T1
* value
, T2 alignment
)
807 return AlignDown(PtrAdd(value
, alignment
- 1), alignment
);
810 //////////////////////////////////////////////////////////////////////////
811 /// Helper structure used to access an array of elements that don't
812 /// correspond to a typical word size.
813 //////////////////////////////////////////////////////////////////////////
814 template<typename T
, size_t BitsPerElementT
, size_t ArrayLenT
>
818 static const size_t BITS_PER_WORD
= sizeof(size_t) * 8;
819 static const size_t ELEMENTS_PER_WORD
= BITS_PER_WORD
/ BitsPerElementT
;
820 static const size_t NUM_WORDS
= (ArrayLenT
+ ELEMENTS_PER_WORD
- 1) / ELEMENTS_PER_WORD
;
821 static const size_t ELEMENT_MASK
= (size_t(1) << BitsPerElementT
) - 1;
823 static_assert(ELEMENTS_PER_WORD
* BitsPerElementT
== BITS_PER_WORD
,
824 "Element size must an integral fraction of pointer size");
826 size_t m_words
[NUM_WORDS
] = {};
830 T
operator[] (size_t elementIndex
) const
832 size_t word
= m_words
[elementIndex
/ ELEMENTS_PER_WORD
];
833 word
>>= ((elementIndex
% ELEMENTS_PER_WORD
) * BitsPerElementT
);
834 return T(word
& ELEMENT_MASK
);