1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Utilities used by SWR core.
27 ******************************************************************************/
31 #include "common/os.h"
32 #include "common/simdintrin.h"
33 #include "common/swr_assert.h"
36 void SaveImageToPNGFile(
37 const WCHAR
*pFilename
,
42 void OpenBitmapFromFile(
43 const WCHAR
*pFilename
,
49 /// @todo assume linux is always 64 bit
50 #if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
51 #define _MM_INSERT_EPI64 _mm_insert_epi64
52 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
54 INLINE INT64
_MM_EXTRACT_EPI64(__m128i a
, const int32_t ndx
)
56 OSALIGNLINE(uint32_t) elems
[4];
57 _mm_store_si128((__m128i
*)elems
, a
);
60 uint64_t foo
= elems
[0];
61 foo
|= (uint64_t)elems
[1] << 32;
66 uint64_t foo
= elems
[2];
67 foo
|= (uint64_t)elems
[3] << 32;
72 INLINE __m128i
_MM_INSERT_EPI64(__m128i a
, INT64 b
, const int32_t ndx
)
74 OSALIGNLINE(int64_t) elems
[2];
75 _mm_store_si128((__m128i
*)elems
, a
);
85 out
= _mm_load_si128((const __m128i
*)elems
);
90 OSALIGNLINE(struct) BBOX
92 int top
, bottom
, left
, right
;
95 BBOX(int t
, int b
, int l
, int r
) : top(t
), bottom(b
), left(l
), right(r
) {}
97 bool operator==(const BBOX
& rhs
)
99 return (this->top
== rhs
.top
&&
100 this->bottom
== rhs
.bottom
&&
101 this->left
== rhs
.left
&&
102 this->right
== rhs
.right
);
105 bool operator!=(const BBOX
& rhs
)
107 return !(*this == rhs
);
113 simdscalari top
, bottom
, left
, right
;
//////////////////////////////////////////////////////////////////////////
/// @brief In-place transpose of a 4x4 matrix of 32-bit floats held in
///        four SSE registers (one register per row).
/// @param row0 - row 0 on entry, column 0 on exit
/// @param row1 - row 1 on entry, column 1 on exit
/// @param row2 - row 2 on entry, column 2 on exit
/// @param row3 - row 3 on entry, column 3 on exit
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    // The shuffles are done with integer unpack instructions; the casts
    // only reinterpret the bits.
    const __m128i a = _mm_castps_si128(row0);
    const __m128i b = _mm_castps_si128(row1);
    const __m128i c = _mm_castps_si128(row2);
    const __m128i d = _mm_castps_si128(row3);

    // Interleave 32-bit elements of each row pair. Every intermediate is
    // kept in its own variable so that no input is overwritten before
    // its last use.
    const __m128i abLo = _mm_unpacklo_epi32(a, b);   // a0 b0 a1 b1
    const __m128i abHi = _mm_unpackhi_epi32(a, b);   // a2 b2 a3 b3
    const __m128i cdLo = _mm_unpacklo_epi32(c, d);   // c0 d0 c1 d1
    const __m128i cdHi = _mm_unpackhi_epi32(c, d);   // c2 d2 c3 d3

    // Combine 64-bit halves to form the columns.
    row0 = _mm_castsi128_ps(_mm_unpacklo_epi64(abLo, cdLo));   // a0 b0 c0 d0
    row1 = _mm_castsi128_ps(_mm_unpackhi_epi64(abLo, cdLo));   // a1 b1 c1 d1
    row2 = _mm_castsi128_ps(_mm_unpacklo_epi64(abHi, cdHi));   // a2 b2 c2 d2
    row3 = _mm_castsi128_ps(_mm_unpackhi_epi64(abHi, cdHi));   // a3 b3 c3 d3
}
//////////////////////////////////////////////////////////////////////////
/// @brief In-place transpose of a 4x4 matrix of 32-bit integers held in
///        four SSE registers (one register per row).
/// @param row0 - row 0 on entry, column 0 on exit
/// @param row1 - row 1 on entry, column 1 on exit
/// @param row2 - row 2 on entry, column 2 on exit
/// @param row3 - row 3 on entry, column 3 on exit
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    // Interleave 32-bit elements of each row pair. The inputs are only
    // read here, so nothing is clobbered before its last use.
    const __m128i abLo = _mm_unpacklo_epi32(row0, row1);   // a0 b0 a1 b1
    const __m128i abHi = _mm_unpackhi_epi32(row0, row1);   // a2 b2 a3 b3
    const __m128i cdLo = _mm_unpacklo_epi32(row2, row3);   // c0 d0 c1 d1
    const __m128i cdHi = _mm_unpackhi_epi32(row2, row3);   // c2 d2 c3 d3

    // Combine 64-bit halves to form the columns.
    row0 = _mm_unpacklo_epi64(abLo, cdLo);   // a0 b0 c0 d0
    row1 = _mm_unpackhi_epi64(abLo, cdLo);   // a1 b1 c1 d1
    row2 = _mm_unpacklo_epi64(abHi, cdHi);   // a2 b2 c2 d2
    row3 = _mm_unpackhi_epi64(abHi, cdHi);   // a3 b3 c3 d3
}
166 #define GCC_VERSION (__GNUC__ * 10000 \
167 + __GNUC_MINOR__ * 100 \
168 + __GNUC_PATCHLEVEL__)
170 #if defined(__GNUC__) && (GCC_VERSION < 40900)
171 #define _mm_undefined_ps _mm_setzero_ps
172 #define _mm_undefined_si128 _mm_setzero_si128
173 #if KNOB_SIMD_WIDTH == 8
174 #define _mm256_undefined_ps _mm256_setzero_ps
178 #if KNOB_SIMD_WIDTH == 8
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes three 8-wide rows (x, y, z) into eight 4-wide
///        vectors laid out as x y z w. There is no fourth input row, so
///        the w element of every output comes from
///        _mm256_undefined_ps() and its value is unspecified.
/// @param vDst  - receives the eight transposed vectors
/// @param vSrc0 - x0..x7
/// @param vSrc1 - y0..y7
/// @param vSrc2 - z0..z7
void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
{
    // Pair x with z and y with the unspecified w row.
    const __m256 xzLo = _mm256_unpacklo_ps(vSrc0, vSrc2);                   // x0z0x1z1 x4z4x5z5
    const __m256 ywLo = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());   // y0w0y1w1 y4w4y5w5
    const __m256 xzHi = _mm256_unpackhi_ps(vSrc0, vSrc2);                   // x2z2x3z3 x6z6x7z7
    const __m256 ywHi = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());   // y2w2y3w3 y6w6y7w7

    // Interleave the pairs into complete xyzw groups; each register
    // carries element i in its low lane and element i+4 in its high lane.
    const __m256 elem04 = _mm256_unpacklo_ps(xzLo, ywLo);   // x0y0z0w0 x4y4z4w4
    const __m256 elem15 = _mm256_unpackhi_ps(xzLo, ywLo);   // x1y1z1w1 x5y5z5w5
    const __m256 elem26 = _mm256_unpacklo_ps(xzHi, ywHi);   // x2y2z2w2 x6y6z6w6
    const __m256 elem37 = _mm256_unpackhi_ps(xzHi, ywHi);   // x3y3z3w3 x7y7z7w7

    // Low lanes give elements 0-3 ...
    vDst[0] = _mm256_castps256_ps128(elem04);
    vDst[1] = _mm256_castps256_ps128(elem15);
    vDst[2] = _mm256_castps256_ps128(elem26);
    vDst[3] = _mm256_castps256_ps128(elem37);

    // ... and high lanes give elements 4-7.
    vDst[4] = _mm256_extractf128_ps(elem04, 1);
    vDst[5] = _mm256_extractf128_ps(elem15, 1);
    vDst[6] = _mm256_extractf128_ps(elem26, 1);
    vDst[7] = _mm256_extractf128_ps(elem37, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes four 8-wide rows (x, y, z, w) into eight 4-wide
///        vectors laid out as x y z w.
/// @param vDst  - receives the eight transposed vectors
/// @param vSrc0 - x0..x7
/// @param vSrc1 - y0..y7
/// @param vSrc2 - z0..z7
/// @param vSrc3 - w0..w7
void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
{
    // Pair x with z and y with w.
    const __m256 xzLo = _mm256_unpacklo_ps(vSrc0, vSrc2);   // x0z0x1z1 x4z4x5z5
    const __m256 ywLo = _mm256_unpacklo_ps(vSrc1, vSrc3);   // y0w0y1w1 y4w4y5w5
    const __m256 xzHi = _mm256_unpackhi_ps(vSrc0, vSrc2);   // x2z2x3z3 x6z6x7z7
    const __m256 ywHi = _mm256_unpackhi_ps(vSrc1, vSrc3);   // y2w2y3w3 y6w6y7w7

    // Interleave the pairs into complete xyzw groups; each register
    // carries element i in its low lane and element i+4 in its high lane.
    const __m256 elem04 = _mm256_unpacklo_ps(xzLo, ywLo);   // x0y0z0w0 x4y4z4w4
    const __m256 elem15 = _mm256_unpackhi_ps(xzLo, ywLo);   // x1y1z1w1 x5y5z5w5
    const __m256 elem26 = _mm256_unpacklo_ps(xzHi, ywHi);   // x2y2z2w2 x6y6z6w6
    const __m256 elem37 = _mm256_unpackhi_ps(xzHi, ywHi);   // x3y3z3w3 x7y7z7w7

    // Low lanes give elements 0-3 ...
    vDst[0] = _mm256_castps256_ps128(elem04);
    vDst[1] = _mm256_castps256_ps128(elem15);
    vDst[2] = _mm256_castps256_ps128(elem26);
    vDst[3] = _mm256_castps256_ps128(elem37);

    // ... and high lanes give elements 4-7.
    vDst[4] = _mm256_extractf128_ps(elem04, 1);
    vDst[5] = _mm256_extractf128_ps(elem15, 1);
    vDst[6] = _mm256_extractf128_ps(elem26, 1);
    vDst[7] = _mm256_extractf128_ps(elem37, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes an 8x8 matrix of 32-bit floats.
/// @param vDst   - receives the eight transposed rows
/// @param vMask0 - input row 0
/// @param vMask1 - input row 1
/// @param vMask2 - input row 2
/// @param vMask3 - input row 3
/// @param vMask4 - input row 4
/// @param vMask5 - input row 5
/// @param vMask6 - input row 6
/// @param vMask7 - input row 7
void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
{
    // Stage 1: interleave 32-bit elements of adjacent row pairs.
    const __m256 lo01 = _mm256_unpacklo_ps(vMask0, vMask1);
    const __m256 hi01 = _mm256_unpackhi_ps(vMask0, vMask1);
    const __m256 lo23 = _mm256_unpacklo_ps(vMask2, vMask3);
    const __m256 hi23 = _mm256_unpackhi_ps(vMask2, vMask3);
    const __m256 lo45 = _mm256_unpacklo_ps(vMask4, vMask5);
    const __m256 hi45 = _mm256_unpackhi_ps(vMask4, vMask5);
    const __m256 lo67 = _mm256_unpacklo_ps(vMask6, vMask7);
    const __m256 hi67 = _mm256_unpackhi_ps(vMask6, vMask7);

    // Stage 2: gather 64-bit pairs so each register holds four rows'
    // worth of one column per 128-bit lane (rows 0-3, then rows 4-7).
    const __m256 top0 = _mm256_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
    const __m256 top1 = _mm256_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
    const __m256 top2 = _mm256_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
    const __m256 top3 = _mm256_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
    const __m256 bot0 = _mm256_shuffle_ps(lo45, lo67, _MM_SHUFFLE(1, 0, 1, 0));
    const __m256 bot1 = _mm256_shuffle_ps(lo45, lo67, _MM_SHUFFLE(3, 2, 3, 2));
    const __m256 bot2 = _mm256_shuffle_ps(hi45, hi67, _MM_SHUFFLE(1, 0, 1, 0));
    const __m256 bot3 = _mm256_shuffle_ps(hi45, hi67, _MM_SHUFFLE(3, 2, 3, 2));

    // Stage 3: stitch 128-bit lanes together. Selector 0x20 joins the
    // two low lanes, 0x31 joins the two high lanes.
    vDst[0] = _mm256_permute2f128_ps(top0, bot0, 0x20);
    vDst[1] = _mm256_permute2f128_ps(top1, bot1, 0x20);
    vDst[2] = _mm256_permute2f128_ps(top2, bot2, 0x20);
    vDst[3] = _mm256_permute2f128_ps(top3, bot3, 0x20);
    vDst[4] = _mm256_permute2f128_ps(top0, bot0, 0x31);
    vDst[5] = _mm256_permute2f128_ps(top1, bot1, 0x31);
    vDst[6] = _mm256_permute2f128_ps(top2, bot2, 0x31);
    vDst[7] = _mm256_permute2f128_ps(top3, bot3, 0x31);
}
257 void vTranspose8x8(__m256 (&vDst
)[8], const __m256i
&vMask0
, const __m256i
&vMask1
, const __m256i
&vMask2
, const __m256i
&vMask3
, const __m256i
&vMask4
, const __m256i
&vMask5
, const __m256i
&vMask6
, const __m256i
&vMask7
)
259 vTranspose8x8(vDst
, _mm256_castsi256_ps(vMask0
), _mm256_castsi256_ps(vMask1
), _mm256_castsi256_ps(vMask2
), _mm256_castsi256_ps(vMask3
),
260 _mm256_castsi256_ps(vMask4
), _mm256_castsi256_ps(vMask5
), _mm256_castsi256_ps(vMask6
), _mm256_castsi256_ps(vMask7
));
264 //////////////////////////////////////////////////////////////////////////
265 /// TranposeSingleComponent
266 //////////////////////////////////////////////////////////////////////////
267 template<uint32_t bpp
>
268 struct TransposeSingleComponent
270 //////////////////////////////////////////////////////////////////////////
271 /// @brief Pass-thru for single component.
272 /// @param pSrc - source data in SOA form
273 /// @param pDst - output data in AOS form
274 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
276 memcpy(pDst
, pSrc
, (bpp
* KNOB_SIMD_WIDTH
) / 8);
280 //////////////////////////////////////////////////////////////////////////
282 //////////////////////////////////////////////////////////////////////////
283 struct Transpose8_8_8_8
285 //////////////////////////////////////////////////////////////////////////
286 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
287 /// @param pSrc - source data in SOA form
288 /// @param pDst - output data in AOS form
289 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
291 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
292 #if KNOB_SIMD_WIDTH == 8
293 #if KNOB_ARCH == KNOB_ARCH_AVX
294 __m128i c0c1
= _mm256_castsi256_si128(src
); // rrrrrrrrgggggggg
295 __m128i c2c3
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src
), 1)); // bbbbbbbbaaaaaaaa
296 __m128i c0c2
= _mm_unpacklo_epi64(c0c1
, c2c3
); // rrrrrrrrbbbbbbbb
297 __m128i c1c3
= _mm_unpackhi_epi64(c0c1
, c2c3
); // ggggggggaaaaaaaa
298 __m128i c01
= _mm_unpacklo_epi8(c0c2
, c1c3
); // rgrgrgrgrgrgrgrg
299 __m128i c23
= _mm_unpackhi_epi8(c0c2
, c1c3
); // babababababababa
300 __m128i c0123lo
= _mm_unpacklo_epi16(c01
, c23
); // rgbargbargbargba
301 __m128i c0123hi
= _mm_unpackhi_epi16(c01
, c23
); // rgbargbargbargba
302 _mm_store_si128((__m128i
*)pDst
, c0123lo
);
303 _mm_store_si128((__m128i
*)(pDst
+ 16), c0123hi
);
304 #elif KNOB_ARCH == KNOB_ARCH_AVX2
305 simdscalari dst01
= _mm256_shuffle_epi8(src
,
306 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
307 simdscalari dst23
= _mm256_permute2x128_si256(src
, src
, 0x01);
308 dst23
= _mm256_shuffle_epi8(dst23
,
309 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
310 simdscalari dst
= _mm256_or_si256(dst01
, dst23
);
311 _simd_store_si((simdscalari
*)pDst
, dst
);
314 #error Unsupported vector width
319 //////////////////////////////////////////////////////////////////////////
321 //////////////////////////////////////////////////////////////////////////
322 struct Transpose8_8_8
324 //////////////////////////////////////////////////////////////////////////
325 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
326 /// @param pSrc - source data in SOA form
327 /// @param pDst - output data in AOS form
328 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
331 //////////////////////////////////////////////////////////////////////////
333 //////////////////////////////////////////////////////////////////////////
336 //////////////////////////////////////////////////////////////////////////
337 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
338 /// @param pSrc - source data in SOA form
339 /// @param pDst - output data in AOS form
340 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
342 simdscalari src
= _simd_load_si((const simdscalari
*)pSrc
);
344 #if KNOB_SIMD_WIDTH == 8
345 __m128i rg
= _mm256_castsi256_si128(src
); // rrrrrrrr gggggggg
346 __m128i g
= _mm_unpackhi_epi64(rg
, rg
); // gggggggg gggggggg
347 rg
= _mm_unpacklo_epi8(rg
, g
);
348 _mm_store_si128((__m128i
*)pDst
, rg
);
350 #error Unsupported vector width
355 //////////////////////////////////////////////////////////////////////////
356 /// Transpose32_32_32_32
357 //////////////////////////////////////////////////////////////////////////
358 struct Transpose32_32_32_32
360 //////////////////////////////////////////////////////////////////////////
361 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
362 /// @param pSrc - source data in SOA form
363 /// @param pDst - output data in AOS form
364 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
366 #if KNOB_SIMD_WIDTH == 8
367 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
368 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
369 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
370 simdscalar src3
= _simd_load_ps((const float*)pSrc
+ 24);
373 vTranspose4x8(vDst
, src0
, src1
, src2
, src3
);
374 _mm_store_ps((float*)pDst
, vDst
[0]);
375 _mm_store_ps((float*)pDst
+4, vDst
[1]);
376 _mm_store_ps((float*)pDst
+8, vDst
[2]);
377 _mm_store_ps((float*)pDst
+12, vDst
[3]);
378 _mm_store_ps((float*)pDst
+16, vDst
[4]);
379 _mm_store_ps((float*)pDst
+20, vDst
[5]);
380 _mm_store_ps((float*)pDst
+24, vDst
[6]);
381 _mm_store_ps((float*)pDst
+28, vDst
[7]);
383 #error Unsupported vector width
388 //////////////////////////////////////////////////////////////////////////
389 /// Transpose32_32_32
390 //////////////////////////////////////////////////////////////////////////
391 struct Transpose32_32_32
393 //////////////////////////////////////////////////////////////////////////
394 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
395 /// @param pSrc - source data in SOA form
396 /// @param pDst - output data in AOS form
397 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
399 #if KNOB_SIMD_WIDTH == 8
400 simdscalar src0
= _simd_load_ps((const float*)pSrc
);
401 simdscalar src1
= _simd_load_ps((const float*)pSrc
+ 8);
402 simdscalar src2
= _simd_load_ps((const float*)pSrc
+ 16);
405 vTranspose3x8(vDst
, src0
, src1
, src2
);
406 _mm_store_ps((float*)pDst
, vDst
[0]);
407 _mm_store_ps((float*)pDst
+ 4, vDst
[1]);
408 _mm_store_ps((float*)pDst
+ 8, vDst
[2]);
409 _mm_store_ps((float*)pDst
+ 12, vDst
[3]);
410 _mm_store_ps((float*)pDst
+ 16, vDst
[4]);
411 _mm_store_ps((float*)pDst
+ 20, vDst
[5]);
412 _mm_store_ps((float*)pDst
+ 24, vDst
[6]);
413 _mm_store_ps((float*)pDst
+ 28, vDst
[7]);
415 #error Unsupported vector width
420 //////////////////////////////////////////////////////////////////////////
422 //////////////////////////////////////////////////////////////////////////
423 struct Transpose32_32
425 //////////////////////////////////////////////////////////////////////////
426 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
427 /// @param pSrc - source data in SOA form
428 /// @param pDst - output data in AOS form
429 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
431 const float* pfSrc
= (const float*)pSrc
;
432 __m128 src_r0
= _mm_load_ps(pfSrc
+ 0);
433 __m128 src_r1
= _mm_load_ps(pfSrc
+ 4);
434 __m128 src_g0
= _mm_load_ps(pfSrc
+ 8);
435 __m128 src_g1
= _mm_load_ps(pfSrc
+ 12);
437 __m128 dst0
= _mm_unpacklo_ps(src_r0
, src_g0
);
438 __m128 dst1
= _mm_unpackhi_ps(src_r0
, src_g0
);
439 __m128 dst2
= _mm_unpacklo_ps(src_r1
, src_g1
);
440 __m128 dst3
= _mm_unpackhi_ps(src_r1
, src_g1
);
442 float* pfDst
= (float*)pDst
;
443 _mm_store_ps(pfDst
+ 0, dst0
);
444 _mm_store_ps(pfDst
+ 4, dst1
);
445 _mm_store_ps(pfDst
+ 8, dst2
);
446 _mm_store_ps(pfDst
+ 12, dst3
);
450 //////////////////////////////////////////////////////////////////////////
451 /// Transpose16_16_16_16
452 //////////////////////////////////////////////////////////////////////////
453 struct Transpose16_16_16_16
455 //////////////////////////////////////////////////////////////////////////
456 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
457 /// @param pSrc - source data in SOA form
458 /// @param pDst - output data in AOS form
459 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
461 #if KNOB_SIMD_WIDTH == 8
462 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
463 simdscalari src_ba
= _simd_load_si((const simdscalari
*)(pSrc
+ sizeof(simdscalari
)));
465 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
466 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
467 __m128i src_b
= _mm256_extractf128_si256(src_ba
, 0);
468 __m128i src_a
= _mm256_extractf128_si256(src_ba
, 1);
470 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
471 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
472 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
473 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
475 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
476 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
477 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
478 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
480 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
481 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
482 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
483 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
485 #error Unsupported vector width
490 //////////////////////////////////////////////////////////////////////////
491 /// Transpose16_16_16
492 //////////////////////////////////////////////////////////////////////////
493 struct Transpose16_16_16
495 //////////////////////////////////////////////////////////////////////////
496 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
497 /// @param pSrc - source data in SOA form
498 /// @param pDst - output data in AOS form
499 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
501 #if KNOB_SIMD_WIDTH == 8
502 simdscalari src_rg
= _simd_load_si((const simdscalari
*)pSrc
);
504 __m128i src_r
= _mm256_extractf128_si256(src_rg
, 0);
505 __m128i src_g
= _mm256_extractf128_si256(src_rg
, 1);
506 __m128i src_b
= _mm_load_si128((const __m128i
*)(pSrc
+ sizeof(simdscalari
)));
507 __m128i src_a
= _mm_undefined_si128();
509 __m128i rg0
= _mm_unpacklo_epi16(src_r
, src_g
);
510 __m128i rg1
= _mm_unpackhi_epi16(src_r
, src_g
);
511 __m128i ba0
= _mm_unpacklo_epi16(src_b
, src_a
);
512 __m128i ba1
= _mm_unpackhi_epi16(src_b
, src_a
);
514 __m128i dst0
= _mm_unpacklo_epi32(rg0
, ba0
);
515 __m128i dst1
= _mm_unpackhi_epi32(rg0
, ba0
);
516 __m128i dst2
= _mm_unpacklo_epi32(rg1
, ba1
);
517 __m128i dst3
= _mm_unpackhi_epi32(rg1
, ba1
);
519 _mm_store_si128(((__m128i
*)pDst
) + 0, dst0
);
520 _mm_store_si128(((__m128i
*)pDst
) + 1, dst1
);
521 _mm_store_si128(((__m128i
*)pDst
) + 2, dst2
);
522 _mm_store_si128(((__m128i
*)pDst
) + 3, dst3
);
524 #error Unsupported vector width
529 //////////////////////////////////////////////////////////////////////////
531 //////////////////////////////////////////////////////////////////////////
532 struct Transpose16_16
534 //////////////////////////////////////////////////////////////////////////
535 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
536 /// @param pSrc - source data in SOA form
537 /// @param pDst - output data in AOS form
538 INLINE
static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
)
540 simdscalar src
= _simd_load_ps((const float*)pSrc
);
542 #if KNOB_SIMD_WIDTH == 8
543 __m128 comp0
= _mm256_castps256_ps128(src
);
544 __m128 comp1
= _mm256_extractf128_ps(src
, 1);
546 __m128i comp0i
= _mm_castps_si128(comp0
);
547 __m128i comp1i
= _mm_castps_si128(comp1
);
549 __m128i resLo
= _mm_unpacklo_epi16(comp0i
, comp1i
);
550 __m128i resHi
= _mm_unpackhi_epi16(comp0i
, comp1i
);
552 _mm_store_si128((__m128i
*)pDst
, resLo
);
553 _mm_store_si128((__m128i
*)pDst
+ 1, resHi
);
555 #error Unsupported vector width
560 //////////////////////////////////////////////////////////////////////////
562 //////////////////////////////////////////////////////////////////////////
565 //////////////////////////////////////////////////////////////////////////
566 /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
567 /// @param pSrc - source data in SOA form
568 /// @param pDst - output data in AOS form
569 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
572 //////////////////////////////////////////////////////////////////////////
574 //////////////////////////////////////////////////////////////////////////
575 struct Transpose32_8_24
577 //////////////////////////////////////////////////////////////////////////
578 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
579 /// @param pSrc - source data in SOA form
580 /// @param pDst - output data in AOS form
581 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
586 //////////////////////////////////////////////////////////////////////////
588 //////////////////////////////////////////////////////////////////////////
589 struct Transpose4_4_4_4
591 //////////////////////////////////////////////////////////////////////////
592 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
593 /// @param pSrc - source data in SOA form
594 /// @param pDst - output data in AOS form
595 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
598 //////////////////////////////////////////////////////////////////////////
600 //////////////////////////////////////////////////////////////////////////
601 struct Transpose5_6_5
603 //////////////////////////////////////////////////////////////////////////
604 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
605 /// @param pSrc - source data in SOA form
606 /// @param pDst - output data in AOS form
607 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
610 //////////////////////////////////////////////////////////////////////////
612 //////////////////////////////////////////////////////////////////////////
613 struct Transpose9_9_9_5
615 //////////////////////////////////////////////////////////////////////////
616 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
617 /// @param pSrc - source data in SOA form
618 /// @param pDst - output data in AOS form
619 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
622 //////////////////////////////////////////////////////////////////////////
624 //////////////////////////////////////////////////////////////////////////
625 struct Transpose5_5_5_1
627 //////////////////////////////////////////////////////////////////////////
628 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
629 /// @param pSrc - source data in SOA form
630 /// @param pDst - output data in AOS form
631 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
634 //////////////////////////////////////////////////////////////////////////
635 /// Transpose10_10_10_2
636 //////////////////////////////////////////////////////////////////////////
637 struct Transpose10_10_10_2
639 //////////////////////////////////////////////////////////////////////////
640 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
641 /// @param pSrc - source data in SOA form
642 /// @param pDst - output data in AOS form
643 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
646 //////////////////////////////////////////////////////////////////////////
647 /// Transpose11_11_10
648 //////////////////////////////////////////////////////////////////////////
649 struct Transpose11_11_10
651 //////////////////////////////////////////////////////////////////////////
652 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
653 /// @param pSrc - source data in SOA form
654 /// @param pDst - output data in AOS form
655 static void Transpose(const BYTE
* pSrc
, BYTE
* pDst
) = delete;
658 // helper function to unroll loops
659 template<int Begin
, int End
, int Step
= 1>
661 template<typename Lambda
>
662 INLINE
static void step(Lambda
& func
) {
664 UnrollerL
<Begin
+ Step
, End
, Step
>::step(func
);
668 template<int End
, int Step
>
669 struct UnrollerL
<End
, End
, Step
> {
670 template<typename Lambda
>
671 static void step(Lambda
& func
) {
675 // general CRC compute
677 uint32_t ComputeCRC(uint32_t crc
, const void *pData
, uint32_t size
)
679 #if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
680 uint32_t sizeInQwords
= size
/ sizeof(uint64_t);
681 uint32_t sizeRemainderBytes
= size
% sizeof(uint64_t);
682 uint64_t* pDataWords
= (uint64_t*)pData
;
683 for (uint32_t i
= 0; i
< sizeInQwords
; ++i
)
685 crc
= (uint32_t)_mm_crc32_u64(crc
, *pDataWords
++);
688 uint32_t sizeInDwords
= size
/ sizeof(uint32_t);
689 uint32_t sizeRemainderBytes
= size
% sizeof(uint32_t);
690 uint32_t* pDataWords
= (uint32_t*)pData
;
691 for (uint32_t i
= 0; i
< sizeInDwords
; ++i
)
693 crc
= _mm_crc32_u32(crc
, *pDataWords
++);
697 BYTE
* pRemainderBytes
= (BYTE
*)pDataWords
;
698 for (uint32_t i
= 0; i
< sizeRemainderBytes
; ++i
)
700 crc
= _mm_crc32_u8(crc
, *pRemainderBytes
++);
//////////////////////////////////////////////////////////////////////////
/// Add byte offset to any-type pointer
/// The offset is in bytes, not in units of sizeof(T); it is applied to
/// the pointer's integer value and may be negative.
//////////////////////////////////////////////////////////////////////////
template <typename T>
static T* PtrAdd(T* p, intptr_t offset)
{
    return reinterpret_cast<T*>(reinterpret_cast<intptr_t>(p) + offset);
}
//////////////////////////////////////////////////////////////////////////
/// Test whether a value is a power of two
/// A value passes when it equals its own lowest set bit. Zero has no set
/// bit and therefore also passes.
//////////////////////////////////////////////////////////////////////////
template <typename T>
static bool IsPow2(T value)
{
    // value & -value isolates the lowest set bit.
    const auto lowestSetBit = value & (0 - value);
    return lowestSetBit == value;
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    SWR_ASSERT(IsPow2(alignment));

    // Clearing the bits below the alignment rounds the value down.
    const T1 lowBits = T1(alignment - 1);
    return value & ~lowBits;
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    // Step past the next boundary unless already on one, then align down.
    const auto bumped = value + T1(alignment - 1);
    return AlignDownPow2(bumped, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    // Do the arithmetic on the pointer's integer value.
    const uintptr_t address = reinterpret_cast<uintptr_t>(value);
    const uintptr_t aligned = AlignDownPow2(address + uintptr_t(alignment - 1), alignment);
    return reinterpret_cast<T1*>(aligned);
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Any alignment is accepted: alignments for which IsPow2 is true go
/// through AlignDownPow2, all others through a modulo.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignDown(T1 value, T2 alignment)
{
    if (!IsPow2(alignment))
    {
        // General case: drop the remainder.
        return value - T1(value % alignment);
    }

    return AlignDownPow2(value, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align down ptr to specified alignment
/// The pointer is converted to uintptr_t, aligned with the value
/// overload of AlignDown, and converted back.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignDown(T1* value, T2 alignment)
{
    const uintptr_t address = uintptr_t(value);
    return (T1*)AlignDown(address, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: the work is delegated to AlignDown, which also handles
/// alignments for which IsPow2 is false.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1 AlignUp(T1 value, T2 alignment)
{
    // Step past the next boundary unless already on one, then align down.
    const auto bumped = value + T1(alignment - 1);
    return AlignDown(bumped, alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: the work is delegated to AlignDown, which also handles
/// alignments for which IsPow2 is false.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
static T1* AlignUp(T1* value, T2 alignment)
{
    // Advance by (alignment - 1) bytes, then align the result down.
    T1* const bumped = PtrAdd(value, alignment - 1);
    return AlignDown(bumped, alignment);
}
805 //////////////////////////////////////////////////////////////////////////
806 /// Helper structure used to access an array of elements that don't
807 /// correspond to a typical word size.
808 //////////////////////////////////////////////////////////////////////////
809 template<typename T
, size_t BitsPerElementT
, size_t ArrayLenT
>
813 static const size_t BITS_PER_WORD
= sizeof(size_t) * 8;
814 static const size_t ELEMENTS_PER_WORD
= BITS_PER_WORD
/ BitsPerElementT
;
815 static const size_t NUM_WORDS
= (ArrayLenT
+ ELEMENTS_PER_WORD
- 1) / ELEMENTS_PER_WORD
;
816 static const size_t ELEMENT_MASK
= (size_t(1) << BitsPerElementT
) - 1;
818 static_assert(ELEMENTS_PER_WORD
* BitsPerElementT
== BITS_PER_WORD
,
819 "Element size must an integral fraction of pointer size");
821 size_t m_words
[NUM_WORDS
] = {};
825 T
operator[] (size_t elementIndex
) const
827 size_t word
= m_words
[elementIndex
/ ELEMENTS_PER_WORD
];
828 word
>>= ((elementIndex
% ELEMENTS_PER_WORD
) * BitsPerElementT
);
829 return T(word
& ELEMENT_MASK
);