1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Utilities used by SWR core.
27 ******************************************************************************/
31 #include <type_traits>
32 #include "common/os.h"
33 #include "common/simdintrin.h"
34 #include "common/swr_assert.h"
37 void SaveImageToPNGFile(
38 const WCHAR
*pFilename
,
44 void OpenBitmapFromFile(
45 const WCHAR
*pFilename
,
51 #if defined(_WIN64) || defined(__x86_64__)
52 #define _MM_INSERT_EPI64 _mm_insert_epi64
53 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
55 INLINE INT64
_MM_EXTRACT_EPI64(__m128i a
, const int32_t ndx
)
57 OSALIGNLINE(uint32_t) elems
[4];
58 _mm_store_si128((__m128i
*)elems
, a
);
61 uint64_t foo
= elems
[0];
62 foo
|= (uint64_t)elems
[1] << 32;
67 uint64_t foo
= elems
[2];
68 foo
|= (uint64_t)elems
[3] << 32;
73 INLINE __m128i
_MM_INSERT_EPI64(__m128i a
, INT64 b
, const int32_t ndx
)
75 OSALIGNLINE(int64_t) elems
[2];
76 _mm_store_si128((__m128i
*)elems
, a
);
86 out
= _mm_load_si128((const __m128i
*)elems
);
91 OSALIGNLINE(struct) BBOX
99 BBOX(int t
, int b
, int l
, int r
) : top(t
), bottom(b
), left(l
), right(r
) {}
101 bool operator==(const BBOX
& rhs
)
103 return (this->top
== rhs
.top
&&
104 this->bottom
== rhs
.bottom
&&
105 this->left
== rhs
.left
&&
106 this->right
== rhs
.right
);
109 bool operator!=(const BBOX
& rhs
)
111 return !(*this == rhs
);
124 void vTranspose(__m128
&row0
, __m128
&row1
, __m128
&row2
, __m128
&row3
)
126 __m128i row0i
= _mm_castps_si128(row0
);
127 __m128i row1i
= _mm_castps_si128(row1
);
128 __m128i row2i
= _mm_castps_si128(row2
);
129 __m128i row3i
= _mm_castps_si128(row3
);
131 __m128i vTemp
= row2i
;
132 row2i
= _mm_unpacklo_epi32(row2i
, row3i
);
133 vTemp
= _mm_unpackhi_epi32(vTemp
, row3i
);
136 row0i
= _mm_unpacklo_epi32(row0i
, row1i
);
137 row3i
= _mm_unpackhi_epi32(row3i
, row1i
);
140 row0i
= _mm_unpacklo_epi64(row0i
, row2i
);
141 row1i
= _mm_unpackhi_epi64(row1i
, row2i
);
144 row2i
= _mm_unpacklo_epi64(row2i
, vTemp
);
145 row3i
= _mm_unpackhi_epi64(row3i
, vTemp
);
147 row0
= _mm_castsi128_ps(row0i
);
148 row1
= _mm_castsi128_ps(row1i
);
149 row2
= _mm_castsi128_ps(row2i
);
150 row3
= _mm_castsi128_ps(row3i
);
154 void vTranspose(__m128i
&row0
, __m128i
&row1
, __m128i
&row2
, __m128i
&row3
)
156 __m128i vTemp
= row2
;
157 row2
= _mm_unpacklo_epi32(row2
, row3
);
158 vTemp
= _mm_unpackhi_epi32(vTemp
, row3
);
161 row0
= _mm_unpacklo_epi32(row0
, row1
);
162 row3
= _mm_unpackhi_epi32(row3
, row1
);
165 row0
= _mm_unpacklo_epi64(row0
, row2
);
166 row1
= _mm_unpackhi_epi64(row1
, row2
);
169 row2
= _mm_unpacklo_epi64(row2
, vTemp
);
170 row3
= _mm_unpackhi_epi64(row3
, vTemp
);
173 #define GCC_VERSION (__GNUC__ * 10000 \
174 + __GNUC_MINOR__ * 100 \
175 + __GNUC_PATCHLEVEL__)
177 #if defined(__GNUC__) && (GCC_VERSION < 40900)
178 #define _mm_undefined_ps _mm_setzero_ps
179 #define _mm_undefined_si128 _mm_setzero_si128
180 #if KNOB_SIMD_WIDTH == 8
181 #define _mm256_undefined_ps _mm256_setzero_ps
185 #if KNOB_SIMD_WIDTH == 8
187 void vTranspose3x8(__m128 (&vDst
)[8], __m256
&vSrc0
, __m256
&vSrc1
, __m256
&vSrc2
)
189 __m256 r0r2
= _mm256_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
190 __m256 r1rx
= _mm256_unpacklo_ps(vSrc1
, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
191 __m256 r02r1xlolo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
192 __m256 r02r1xlohi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
194 r0r2
= _mm256_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
195 r1rx
= _mm256_unpackhi_ps(vSrc1
, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
196 __m256 r02r1xhilo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
197 __m256 r02r1xhihi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
199 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
200 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
201 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
202 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
204 vDst
[4] = _mm256_extractf128_ps(r02r1xlolo
, 1);
205 vDst
[5] = _mm256_extractf128_ps(r02r1xlohi
, 1);
206 vDst
[6] = _mm256_extractf128_ps(r02r1xhilo
, 1);
207 vDst
[7] = _mm256_extractf128_ps(r02r1xhihi
, 1);
211 void vTranspose4x8(__m128 (&vDst
)[8], __m256
&vSrc0
, __m256
&vSrc1
, __m256
&vSrc2
, __m256
&vSrc3
)
213 __m256 r0r2
= _mm256_unpacklo_ps(vSrc0
, vSrc2
); //x0z0x1z1 x4z4x5z5
214 __m256 r1rx
= _mm256_unpacklo_ps(vSrc1
, vSrc3
); //y0w0y1w1 y4w4y5w5
215 __m256 r02r1xlolo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x0y0z0w0 x4y4z4w4
216 __m256 r02r1xlohi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x1y1z1w1 x5y5z5w5
218 r0r2
= _mm256_unpackhi_ps(vSrc0
, vSrc2
); //x2z2x3z3 x6z6x7z7
219 r1rx
= _mm256_unpackhi_ps(vSrc1
, vSrc3
) ; //y2w2y3w3 y6w6yw77
220 __m256 r02r1xhilo
= _mm256_unpacklo_ps(r0r2
, r1rx
); //x2y2z2w2 x6y6z6w6
221 __m256 r02r1xhihi
= _mm256_unpackhi_ps(r0r2
, r1rx
); //x3y3z3w3 x7y7z7w7
223 vDst
[0] = _mm256_castps256_ps128(r02r1xlolo
);
224 vDst
[1] = _mm256_castps256_ps128(r02r1xlohi
);
225 vDst
[2] = _mm256_castps256_ps128(r02r1xhilo
);
226 vDst
[3] = _mm256_castps256_ps128(r02r1xhihi
);
228 vDst
[4] = _mm256_extractf128_ps(r02r1xlolo
, 1);
229 vDst
[5] = _mm256_extractf128_ps(r02r1xlohi
, 1);
230 vDst
[6] = _mm256_extractf128_ps(r02r1xhilo
, 1);
231 vDst
[7] = _mm256_extractf128_ps(r02r1xhihi
, 1);
235 void vTranspose8x8(__m256 (&vDst
)[8], const __m256
&vMask0
, const __m256
&vMask1
, const __m256
&vMask2
, const __m256
&vMask3
, const __m256
&vMask4
, const __m256
&vMask5
, const __m256
&vMask6
, const __m256
&vMask7
)
237 __m256 __t0
= _mm256_unpacklo_ps(vMask0
, vMask1
);
238 __m256 __t1
= _mm256_unpackhi_ps(vMask0
, vMask1
);
239 __m256 __t2
= _mm256_unpacklo_ps(vMask2
, vMask3
);
240 __m256 __t3
= _mm256_unpackhi_ps(vMask2
, vMask3
);
241 __m256 __t4
= _mm256_unpacklo_ps(vMask4
, vMask5
);
242 __m256 __t5
= _mm256_unpackhi_ps(vMask4
, vMask5
);
243 __m256 __t6
= _mm256_unpacklo_ps(vMask6
, vMask7
);
244 __m256 __t7
= _mm256_unpackhi_ps(vMask6
, vMask7
);
245 __m256 __tt0
= _mm256_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(1,0,1,0));
246 __m256 __tt1
= _mm256_shuffle_ps(__t0
,__t2
,_MM_SHUFFLE(3,2,3,2));
247 __m256 __tt2
= _mm256_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(1,0,1,0));
248 __m256 __tt3
= _mm256_shuffle_ps(__t1
,__t3
,_MM_SHUFFLE(3,2,3,2));
249 __m256 __tt4
= _mm256_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(1,0,1,0));
250 __m256 __tt5
= _mm256_shuffle_ps(__t4
,__t6
,_MM_SHUFFLE(3,2,3,2));
251 __m256 __tt6
= _mm256_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(1,0,1,0));
252 __m256 __tt7
= _mm256_shuffle_ps(__t5
,__t7
,_MM_SHUFFLE(3,2,3,2));
253 vDst
[0] = _mm256_permute2f128_ps(__tt0
, __tt4
, 0x20);
254 vDst
[1] = _mm256_permute2f128_ps(__tt1
, __tt5
, 0x20);
255 vDst
[2] = _mm256_permute2f128_ps(__tt2
, __tt6
, 0x20);
256 vDst
[3] = _mm256_permute2f128_ps(__tt3
, __tt7
, 0x20);
257 vDst
[4] = _mm256_permute2f128_ps(__tt0
, __tt4
, 0x31);
258 vDst
[5] = _mm256_permute2f128_ps(__tt1
, __tt5
, 0x31);
259 vDst
[6] = _mm256_permute2f128_ps(__tt2
, __tt6
, 0x31);
260 vDst
[7] = _mm256_permute2f128_ps(__tt3
, __tt7
, 0x31);
264 void vTranspose8x8(__m256 (&vDst
)[8], const __m256i
&vMask0
, const __m256i
&vMask1
, const __m256i
&vMask2
, const __m256i
&vMask3
, const __m256i
&vMask4
, const __m256i
&vMask5
, const __m256i
&vMask6
, const __m256i
&vMask7
)
266 vTranspose8x8(vDst
, _mm256_castsi256_ps(vMask0
), _mm256_castsi256_ps(vMask1
), _mm256_castsi256_ps(vMask2
), _mm256_castsi256_ps(vMask3
),
267 _mm256_castsi256_ps(vMask4
), _mm256_castsi256_ps(vMask5
), _mm256_castsi256_ps(vMask6
), _mm256_castsi256_ps(vMask7
));
271 //////////////////////////////////////////////////////////////////////////
272 /// TransposeSingleComponent
273 //////////////////////////////////////////////////////////////////////////
274 template<uint32_t bpp
>
275 struct TransposeSingleComponent
277 //////////////////////////////////////////////////////////////////////////
278 /// @brief Pass-thru for single component.
279 /// @param pSrc - source data in SOA form
280 /// @param pDst - output data in AOS form
281 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
283 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
287 //////////////////////////////////////////////////////////////////////////
289 //////////////////////////////////////////////////////////////////////////
290 struct Transpose8_8_8_8
292 //////////////////////////////////////////////////////////////////////////
293 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
294 /// @param pSrc - source data in SOA form
295 /// @param pDst - output data in AOS form
296 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
298 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
299 #if KNOB_SIMD_WIDTH == 8
300 #if KNOB_ARCH == KNOB_ARCH_AVX
301 __m128i c0c1
= _mm256_castsi256_si128(src
); // rrrrrrrrgggggggg
302 __m128i c2c3
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src
), 1)); // bbbbbbbbaaaaaaaa
303 __m128i c0c2
= _mm_unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
304 __m128i c1c3
= _mm_unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
305 __m128i c01
= _mm_unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
306 __m128i c23
= _mm_unpackhi_epi8(c0c2
, c1c3
); // babababababababa
307 __m128i c0123lo
= _mm_unpacklo_epi16(c01
, c23
); // rgbargbargbargba
308 __m128i c0123hi
= _mm_unpackhi_epi16(c01
, c23
); // rgbargbargbargba
309 _mm_store_si128((__m128i
*)pDst
, c0123lo
);
310 _mm_store_si128((__m128i
*)(pDst
+ 16), c0123hi
);
311 #elif KNOB_ARCH == KNOB_ARCH_AVX2
312 simdscalari dst01
= _mm256_shuffle_epi8(src
,
313 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
314 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
315 dst23
= _mm256_shuffle_epi8(dst23
,
316 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
317 simdscalari dst
= _mm256_or_si256(dst01
, dst23
);
318 _simd_store_si((simdscalari
*)pDst
, dst
);
321 #error Unsupported vector width
326 //////////////////////////////////////////////////////////////////////////
328 //////////////////////////////////////////////////////////////////////////
329 struct Transpose8_8_8
331 //////////////////////////////////////////////////////////////////////////
332 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
333 /// @param pSrc - source data in SOA form
334 /// @param pDst - output data in AOS form
335 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
338 //////////////////////////////////////////////////////////////////////////
340 //////////////////////////////////////////////////////////////////////////
343 //////////////////////////////////////////////////////////////////////////
344 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
345 /// @param pSrc - source data in SOA form
346 /// @param pDst - output data in AOS form
347 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
349 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
351 #if KNOB_SIMD_WIDTH == 8
352 __m128i rg
= _mm256_castsi256_si128(src
); // rrrrrrrr gggggggg
353 __m128i g
= _mm_unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
354 rg
= _mm_unpacklo_epi8(rg
, g
);
355 _mm_store_si128((__m128i
*)pDst
, rg
);
357 #error Unsupported vector width
362 //////////////////////////////////////////////////////////////////////////
363 /// Transpose32_32_32_32
364 //////////////////////////////////////////////////////////////////////////
365 struct Transpose32_32_32_32
367 //////////////////////////////////////////////////////////////////////////
368 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
369 /// @param pSrc - source data in SOA form
370 /// @param pDst - output data in AOS form
371 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
373 #if KNOB_SIMD_WIDTH == 8
374 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
375 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
376 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
377 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
380 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
381 _mm_store_ps((float*)pDst
, vDst
[0]);
382 _mm_store_ps((float*)pDst
+4, vDst
[1]);
383 _mm_store_ps((float*)pDst
+8, vDst
[2]);
384 _mm_store_ps((float*)pDst
+12, vDst
[3]);
385 _mm_store_ps((float*)pDst
+16, vDst
[4]);
386 _mm_store_ps((float*)pDst
+20, vDst
[5]);
387 _mm_store_ps((float*)pDst
+24, vDst
[6]);
388 _mm_store_ps((float*)pDst
+28, vDst
[7]);
390 #error Unsupported vector width
395 //////////////////////////////////////////////////////////////////////////
396 /// Transpose32_32_32
397 //////////////////////////////////////////////////////////////////////////
398 struct Transpose32_32_32
400 //////////////////////////////////////////////////////////////////////////
401 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
402 /// @param pSrc - source data in SOA form
403 /// @param pDst - output data in AOS form
404 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
406 #if KNOB_SIMD_WIDTH == 8
407 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
408 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
409 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
412 vTranspose3x8(vDst
, src0
, src1
, src2
);
413 _mm_store_ps((float*)pDst
, vDst
[0]);
414 _mm_store_ps((float*)pDst
+ 4, vDst
[1]);
415 _mm_store_ps((float*)pDst
+ 8, vDst
[2]);
416 _mm_store_ps((float*)pDst
+ 12, vDst
[3]);
417 _mm_store_ps((float*)pDst
+ 16, vDst
[4]);
418 _mm_store_ps((float*)pDst
+ 20, vDst
[5]);
419 _mm_store_ps((float*)pDst
+ 24, vDst
[6]);
420 _mm_store_ps((float*)pDst
+ 28, vDst
[7]);
422 #error Unsupported vector width
427 //////////////////////////////////////////////////////////////////////////
429 //////////////////////////////////////////////////////////////////////////
430 struct Transpose32_32
432 //////////////////////////////////////////////////////////////////////////
433 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
434 /// @param pSrc - source data in SOA form
435 /// @param pDst - output data in AOS form
436 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
438 const float* pfSrc
= (const float*)pSrc
;
439 __m128 src_r0
= _mm_load_ps(pfSrc
+ 0);
440 __m128 src_r1
= _mm_load_ps(pfSrc
+ 4);
441 __m128 src_g0
= _mm_load_ps(pfSrc
+ 8);
442 __m128 src_g1
= _mm_load_ps(pfSrc
+ 12);
444 __m128 dst0
= _mm_unpacklo_ps(src_r0
, src_g0
);
445 __m128 dst1
= _mm_unpackhi_ps(src_r0
, src_g0
);
446 __m128 dst2
= _mm_unpacklo_ps(src_r1
, src_g1
);
447 __m128 dst3
= _mm_unpackhi_ps(src_r1
, src_g1
);
449 float* pfDst
= (float*)pDst
;
450 _mm_store_ps(pfDst
+ 0, dst0
);
451 _mm_store_ps(pfDst
+ 4, dst1
);
452 _mm_store_ps(pfDst
+ 8, dst2
);
453 _mm_store_ps(pfDst
+ 12, dst3
);
457 //////////////////////////////////////////////////////////////////////////
458 /// Transpose16_16_16_16
459 //////////////////////////////////////////////////////////////////////////
460 struct Transpose16_16_16_16
462 //////////////////////////////////////////////////////////////////////////
463 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
464 /// @param pSrc - source data in SOA form
465 /// @param pDst - output data in AOS form
466 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
468 #if KNOB_SIMD_WIDTH == 8
469 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
470 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
472 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
473 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
474 __m128i src_b
= _mm256_extractf128_si256(src_ba
, 0);
475 __m128i src_a
= _mm256_extractf128_si256(src_ba
, 1);
477 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
478 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
479 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
480 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
482 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
483 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
484 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
485 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
487 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
488 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
489 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
490 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
492 #error Unsupported vector width
497 //////////////////////////////////////////////////////////////////////////
498 /// Transpose16_16_16
499 //////////////////////////////////////////////////////////////////////////
500 struct Transpose16_16_16
502 //////////////////////////////////////////////////////////////////////////
503 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
504 /// @param pSrc - source data in SOA form
505 /// @param pDst - output data in AOS form
506 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
508 #if KNOB_SIMD_WIDTH == 8
509 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
511 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
512 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
513 __m128i src_b
= _mm_load_si128((const __m128i
*)(pSrc
+ sizeof(simdscalari
)));
514 __m128i src_a
= _mm_undefined_si128();
516 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
517 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
518 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
519 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
521 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
522 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
523 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
524 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
526 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
527 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
528 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
529 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
531 #error Unsupported vector width
536 //////////////////////////////////////////////////////////////////////////
538 //////////////////////////////////////////////////////////////////////////
539 struct Transpose16_16
541 //////////////////////////////////////////////////////////////////////////
542 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
543 /// @param pSrc - source data in SOA form
544 /// @param pDst - output data in AOS form
545 INLINE
static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
)
547 simdscalar src
= _simd_load_ps((const float*)pSrc
);
549 #if KNOB_SIMD_WIDTH == 8
550 __m128 comp0
= _mm256_castps256_ps128(src
);
551 __m128 comp1
= _mm256_extractf128_ps(src
, 1);
553 __m128i comp0i
= _mm_castps_si128(comp0
);
554 __m128i comp1i
= _mm_castps_si128(comp1
);
556 __m128i resLo
= _mm_unpacklo_epi16(comp0i
, comp1i
);
557 __m128i resHi
= _mm_unpackhi_epi16(comp0i
, comp1i
);
559 _mm_store_si128((__m128i
*)pDst
, resLo
);
560 _mm_store_si128((__m128i
*)pDst
+ 1, resHi
);
562 #error Unsupported vector width
567 //////////////////////////////////////////////////////////////////////////
569 //////////////////////////////////////////////////////////////////////////
572 //////////////////////////////////////////////////////////////////////////
573 /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
574 /// @param pSrc - source data in SOA form
575 /// @param pDst - output data in AOS form
576 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
579 //////////////////////////////////////////////////////////////////////////
581 //////////////////////////////////////////////////////////////////////////
582 struct Transpose32_8_24
584 //////////////////////////////////////////////////////////////////////////
585 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
586 /// @param pSrc - source data in SOA form
587 /// @param pDst - output data in AOS form
588 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
593 //////////////////////////////////////////////////////////////////////////
595 //////////////////////////////////////////////////////////////////////////
596 struct Transpose4_4_4_4
598 //////////////////////////////////////////////////////////////////////////
599 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
600 /// @param pSrc - source data in SOA form
601 /// @param pDst - output data in AOS form
602 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
605 //////////////////////////////////////////////////////////////////////////
607 //////////////////////////////////////////////////////////////////////////
608 struct Transpose5_6_5
610 //////////////////////////////////////////////////////////////////////////
611 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
612 /// @param pSrc - source data in SOA form
613 /// @param pDst - output data in AOS form
614 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
617 //////////////////////////////////////////////////////////////////////////
619 //////////////////////////////////////////////////////////////////////////
620 struct Transpose9_9_9_5
622 //////////////////////////////////////////////////////////////////////////
623 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
624 /// @param pSrc - source data in SOA form
625 /// @param pDst - output data in AOS form
626 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
629 //////////////////////////////////////////////////////////////////////////
631 //////////////////////////////////////////////////////////////////////////
632 struct Transpose5_5_5_1
634 //////////////////////////////////////////////////////////////////////////
635 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
636 /// @param pSrc - source data in SOA form
637 /// @param pDst - output data in AOS form
638 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
641 //////////////////////////////////////////////////////////////////////////
642 /// Transpose10_10_10_2
643 //////////////////////////////////////////////////////////////////////////
644 struct Transpose10_10_10_2
646 //////////////////////////////////////////////////////////////////////////
647 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
648 /// @param pSrc - source data in SOA form
649 /// @param pDst - output data in AOS form
650 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
653 //////////////////////////////////////////////////////////////////////////
654 /// Transpose11_11_10
655 //////////////////////////////////////////////////////////////////////////
656 struct Transpose11_11_10
658 //////////////////////////////////////////////////////////////////////////
659 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
660 /// @param pSrc - source data in SOA form
661 /// @param pDst - output data in AOS form
662 static void Transpose(const uint8_t* pSrc
, uint8_t* pDst
) = delete;
665 // helper function to unroll loops
666 template<int Begin
, int End
, int Step
= 1>
668 template<typename Lambda
>
669 INLINE
static void step(Lambda
& func
) {
671 UnrollerL
<Begin
+ Step
, End
, Step
>::step(func
);
675 template<int End
, int Step
>
676 struct UnrollerL
<End
, End
, Step
> {
677 template<typename Lambda
>
678 static void step(Lambda
& func
) {
682 // general CRC compute
684 uint32_t ComputeCRC(uint32_t crc
, const void *pData
, uint32_t size
)
686 #if defined(_WIN64) || defined(__x86_64__)
687 uint32_t sizeInQwords
= size
/ sizeof(uint64_t);
688 uint32_t sizeRemainderBytes
= size
% sizeof(uint64_t);
689 uint64_t* pDataWords
= (uint64_t*)pData
;
690 for (uint32_t i
= 0; i
< sizeInQwords
; ++i
)
692 crc
= (uint32_t)_mm_crc32_u64(crc
, *pDataWords
++);
695 uint32_t sizeInDwords
= size
/ sizeof(uint32_t);
696 uint32_t sizeRemainderBytes
= size
% sizeof(uint32_t);
697 uint32_t* pDataWords
= (uint32_t*)pData
;
698 for (uint32_t i
= 0; i
< sizeInDwords
; ++i
)
700 crc
= _mm_crc32_u32(crc
, *pDataWords
++);
704 uint8_t* pRemainderBytes
= (uint8_t*)pDataWords
;
705 for (uint32_t i
= 0; i
< sizeRemainderBytes
; ++i
)
707 crc
= _mm_crc32_u8(crc
, *pRemainderBytes
++);
//////////////////////////////////////////////////////////////////////////
/// Offset a pointer of any type by a signed number of BYTES
/// (not elements); the pointee type is preserved.
//////////////////////////////////////////////////////////////////////////
template <typename T>
static T* PtrAdd(T* p, intptr_t offset)
{
    const intptr_t address = reinterpret_cast<intptr_t>(p) + offset;
    return reinterpret_cast<T*>(address);
}
//////////////////////////////////////////////////////////////////////////
/// Power-of-two test: true when value equals its own lowest set bit.
/// Note that a value of zero also satisfies this test.
//////////////////////////////////////////////////////////////////////////
template <typename T>
static bool IsPow2(T value)
{
    // value & -value isolates the lowest set bit
    const auto lowestSetBit = value & (0 - value);
    return lowestSetBit == value;
}
//////////////////////////////////////////////////////////////////////////
/// Round value down to a multiple of alignment by clearing its low bits.
/// Note: IsPow2(alignment) MUST be true (asserted)
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));

    // (alignment - 1) has exactly the bits below the alignment set
    const auto keepMask = ~T1(alignment - 1);
    return value & keepMask;
}
//////////////////////////////////////////////////////////////////////////
/// Round value up to a multiple of alignment: bias by (alignment - 1),
/// then align down.
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    const auto biased = value + T1(alignment - 1);
    return AlignDownPow2(biased, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Round a pointer up to a multiple of alignment (in bytes); the
/// arithmetic is done on the address as an unsigned integer.
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    const uintptr_t address = reinterpret_cast<uintptr_t>(value);
    const uintptr_t biased = address + uintptr_t(alignment - 1);
    return reinterpret_cast<T1*>(AlignDownPow2(biased, alignment));
}
//////////////////////////////////////////////////////////////////////////
/// Round value down to a multiple of alignment. Alignments that pass
/// IsPow2 use the mask-based path; any other alignment falls back to
/// subtracting the remainder.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDown(T1 value, T2 alignment)
{
    if (!IsPow2(alignment))
    {
        const auto remainder = value % alignment;
        return value - T1(remainder);
    }

    return AlignDownPow2(value, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Round a pointer down to a multiple of alignment (in bytes) by
/// aligning its address as an unsigned integer.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignDown(T1* value, T2 alignment)
{
    const uintptr_t address = uintptr_t(value);
    return (T1*)AlignDown(address, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Round value up to a multiple of alignment: bias by (alignment - 1),
/// then hand off to AlignDown, which also handles alignments that are
/// not a power of two.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUp(T1 value, T2 alignment)
{
    const auto biased = value + T1(alignment - 1);
    return AlignDown(biased, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Round a pointer up to a multiple of alignment (in bytes): advance it
/// by (alignment - 1) bytes, then hand off to the pointer AlignDown,
/// which also handles alignments that are not a power of two.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUp(T1* value, T2 alignment)
{
    T1* const biased = PtrAdd(value, alignment - 1);
    return AlignDown(biased, alignment);
}
812 //////////////////////////////////////////////////////////////////////////
813 /// Helper structure used to access an array of elements that don't
814 /// correspond to a typical word size.
815 //////////////////////////////////////////////////////////////////////////
816 template<typename T
, size_t BitsPerElementT
, size_t ArrayLenT
>
820 static const size_t BITS_PER_WORD
= sizeof(size_t) * 8;
821 static const size_t ELEMENTS_PER_WORD
= BITS_PER_WORD
/ BitsPerElementT
;
822 static const size_t NUM_WORDS
= (ArrayLenT
+ ELEMENTS_PER_WORD
- 1) / ELEMENTS_PER_WORD
;
823 static const size_t ELEMENT_MASK
= (size_t(1) << BitsPerElementT
) - 1;
825 static_assert(ELEMENTS_PER_WORD
* BitsPerElementT
== BITS_PER_WORD
,
826 "Element size must an integral fraction of pointer size");
828 size_t m_words
[NUM_WORDS
] = {};
832 T
operator[] (size_t elementIndex
) const
834 size_t word
= m_words
[elementIndex
/ ELEMENTS_PER_WORD
];
835 word
>>= ((elementIndex
% ELEMENTS_PER_WORD
) * BitsPerElementT
);
836 return T(word
& ELEMENT_MASK
);
840 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
841 // arguments to static template arguments.
842 template <typename TermT
, typename
... ArgsB
>
843 struct TemplateArgUnroller
845 // Last Arg Terminator
846 static typename
TermT::FuncType
GetFunc(bool bArg
)
850 return TermT::template GetFunc
<ArgsB
..., std::true_type
>();
853 return TermT::template GetFunc
<ArgsB
..., std::false_type
>();
856 // Recursively parse args
857 template <typename
... TArgsT
>
858 static typename
TermT::FuncType
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
862 return TemplateArgUnroller
<TermT
, ArgsB
..., std::true_type
>::GetFunc(remainingArgs
...);
865 return TemplateArgUnroller
<TermT
, ArgsB
..., std::false_type
>::GetFunc(remainingArgs
...);
//////////////////////////////////////////////////////////////////////////
/// Helper used to get an environment variable
/// @param variableName - name of the variable to look up
/// @return the variable's value, or an empty string when it is not set
//////////////////////////////////////////////////////////////////////////
static INLINE std::string GetEnv(const std::string& variableName)
{
    std::string output;
#if defined(_WIN32)
    // First call reports the required buffer size (0 when the variable is absent).
    DWORD valueSize = GetEnvironmentVariableA(variableName.c_str(), nullptr, 0);
    if (!valueSize) return output;
    output.resize(valueSize - 1); // valueSize includes null, output.resize() does not
    GetEnvironmentVariableA(variableName.c_str(), &output[0], valueSize);
#else
    // getenv() returns a null pointer for an unset variable; assigning that
    // to a std::string is undefined behavior, so only copy a real value.
    const char* pValue = getenv(variableName.c_str());
    if (pValue != nullptr)
    {
        output = pValue;
    }
#endif

    return output;
}