1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for SWR_FORMAT functions.
27 ******************************************************************************/
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
34 //////////////////////////////////////////////////////////////////////////
35 template <uint32_t NumBits
, bool Signed
= false>
38 static const uint32_t MyNumBits
= NumBits
;
39 static simdscalar
loadSOA(const uint8_t *pSrc
) = delete;
40 static void storeSOA(uint8_t *pDst
, simdscalar src
) = delete;
41 static simdscalar
unpack(simdscalar
&in
) = delete;
42 static simdscalar
pack(simdscalar
&in
) = delete;
43 #if ENABLE_AVX512_SIMD16
44 static simd16scalar
loadSOA_16(const uint8_t *pSrc
) = delete;
45 static void storeSOA(uint8_t *pDst
, simd16scalar src
) = delete;
46 static simd16scalar
unpack(simd16scalar
&in
) = delete;
47 static simd16scalar
pack(simd16scalar
&in
) = delete;
51 //////////////////////////////////////////////////////////////////////////
52 /// PackTraits - Helpers for packing / unpacking unused channels
53 //////////////////////////////////////////////////////////////////////////
55 struct PackTraits
<0, false>
57 static const uint32_t MyNumBits
= 0;
59 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_setzero_ps(); }
60 static void storeSOA(uint8_t *pDst
, simdscalar src
) { return; }
61 static simdscalar
unpack(simdscalar
&in
) { return _simd_setzero_ps(); }
62 static simdscalar
pack(simdscalar
&in
) { return _simd_setzero_ps(); }
63 #if ENABLE_AVX512_SIMD16
64 static simd16scalar
loadSOA_16(const uint8_t *pSrc
) { return _simd16_setzero_ps(); }
65 static void storeSOA(uint8_t *pDst
, simd16scalar src
) { return; }
66 static simd16scalar
unpack(simd16scalar
&in
) { return _simd16_setzero_ps(); }
67 static simd16scalar
pack(simd16scalar
&in
) { return _simd16_setzero_ps(); }
71 //////////////////////////////////////////////////////////////////////////
72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
73 //////////////////////////////////////////////////////////////////////////
75 struct PackTraits
<8, false>
77 static const uint32_t MyNumBits
= 8;
79 static simdscalar
loadSOA(const uint8_t *pSrc
)
81 #if KNOB_SIMD_WIDTH == 8
82 __m256 result
= _mm256_setzero_ps();
83 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
84 return _mm256_insertf128_ps(result
, vLo
, 0);
85 #elif KNOB_SIMD_WIDTH == 16
86 #if ENABLE_AVX512_EMULATION
87 simdscalar result
= _simd_setzero_ps();
89 __m128 src
= _mm_load_ps(reinterpret_cast<const float*>(pSrc
));
91 result
.lo
= _mm256_insertf128_ps(result
.lo
, src
, 0);
96 #error Unsupported vector width
100 static void storeSOA(uint8_t *pDst
, simdscalar src
)
103 #if KNOB_SIMD_WIDTH == 8
104 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
105 #elif KNOB_SIMD_WIDTH == 16
106 #if ENABLE_AVX512_EMULATION
107 _mm_store_ps(reinterpret_cast<float*>(pDst
), _mm256_castps256_ps128(src
.lo
));
110 #error Unsupported vector width
114 static simdscalar
unpack(simdscalar
&in
)
116 #if KNOB_SIMD_WIDTH == 8
117 #if KNOB_ARCH==KNOB_ARCH_AVX
118 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
119 __m128i resLo
= _mm_cvtepu8_epi32(src
);
120 __m128i resHi
= _mm_shuffle_epi8(src
,
121 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
123 __m256i result
= _mm256_castsi128_si256(resLo
);
124 result
= _mm256_insertf128_si256(result
, resHi
, 1);
125 return _mm256_castsi256_ps(result
);
126 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
127 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
129 #elif KNOB_SIMD_WIDTH == 16
130 #if ENABLE_AVX512_EMULATION
133 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
.lo
));
135 result
.lo
= _mm256_cvtepu8_epi32(src
);
137 result
.hi
= _mm256_cvtepu8_epi32(_mm_srli_si128(src
, 8));
139 return _simd_castsi_ps(result
);
142 #error Unsupported vector width
146 static simdscalar
pack(simdscalar
&in
)
148 #if KNOB_SIMD_WIDTH == 8
149 simdscalari src
= _simd_castps_si(in
);
150 __m128i res16
= _mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
151 __m128i res8
= _mm_packus_epi16(res16
, _mm_undefined_si128());
152 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
153 #elif KNOB_SIMD_WIDTH == 16
154 #if ENABLE_AVX512_EMULATION
155 simdscalari result
= _simd_setzero_si();
157 __m128i templo
= _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in
.lo
)), _mm256_extractf128_si256(_mm256_castps_si256(in
.lo
), 1));
159 __m128i temphi
= _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in
.hi
)), _mm256_extractf128_si256(_mm256_castps_si256(in
.hi
), 1));
161 __m128i temp
= _mm_packus_epi16(templo
, temphi
);
163 result
.lo
= _mm256_insertf128_si256(result
.lo
, temp
, 0);
165 return _simd_castsi_ps(result
);
168 #error Unsupported vector width
171 #if ENABLE_AVX512_SIMD16
173 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
175 simd16scalar result
= _simd16_setzero_ps();
176 simdscalar resultlo
= _simd_setzero_ps();
178 const __m128 src
= _mm_load_ps(reinterpret_cast<const float *>(pSrc
));
180 resultlo
= _mm256_insertf128_ps(resultlo
, src
, 0);
181 result
= _simd16_insert_ps(result
, resultlo
, 0);
186 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
188 // store simd16 bytes
189 _mm_store_ps(reinterpret_cast<float *>(pDst
), _mm256_castps256_ps128(_simd16_extract_ps(src
, 0)));
192 static simd16scalar
unpack(simd16scalar
&in
)
194 simd16scalari result
= _simd16_setzero_si();
196 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in
, 0)));
198 result
= _simd16_insert_si(result
, _simd_cvtepu8_epi32(src
), 0);
199 result
= _simd16_insert_si(result
, _simd_cvtepu8_epi32(_mm_srli_si128(src
, 8)), 1);
201 return _simd16_castsi_ps(result
);
204 static simd16scalar
pack(simd16scalar
&in
)
206 simd16scalari result
= _simd16_setzero_si();
207 simdscalari resultlo
= _simd_setzero_si();
209 __m128i templo
= _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in
, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 1));
210 __m128i temphi
= _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in
, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 1)), 1));
212 __m128i temp
= _mm_packus_epi16(templo
, temphi
);
214 resultlo
= _mm256_inserti128_si256(resultlo
, temp
, 0);
215 result
= _simd16_insert_si(result
, resultlo
, 0);
217 return _simd16_castsi_ps(result
);
222 //////////////////////////////////////////////////////////////////////////
223 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
224 //////////////////////////////////////////////////////////////////////////
226 struct PackTraits
<8, true>
228 static const uint32_t MyNumBits
= 8;
230 static simdscalar
loadSOA(const uint8_t *pSrc
)
232 #if KNOB_SIMD_WIDTH == 8
233 __m256 result
= _mm256_setzero_ps();
234 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
235 return _mm256_insertf128_ps(result
, vLo
, 0);
236 #elif KNOB_SIMD_WIDTH == 16
237 #if ENABLE_AVX512_EMULATION
238 simdscalar result
= _simd_setzero_ps();
240 __m128 src
= _mm_load_ps(reinterpret_cast<const float*>(pSrc
));
242 result
.lo
= _mm256_insertf128_ps(result
.lo
, src
, 0);
247 #error Unsupported vector width
251 static void storeSOA(uint8_t *pDst
, simdscalar src
)
254 #if KNOB_SIMD_WIDTH == 8
255 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
256 #elif KNOB_SIMD_WIDTH == 16
257 #if ENABLE_AVX512_EMULATION
258 _mm_store_ps(reinterpret_cast<float*>(pDst
), _mm256_castps256_ps128(src
.lo
));
261 #error Unsupported vector width
265 static simdscalar
unpack(simdscalar
&in
)
267 #if KNOB_SIMD_WIDTH == 8
268 #if KNOB_ARCH==KNOB_ARCH_AVX
269 SWR_ASSERT(0); // I think this may be incorrect.
270 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
271 __m128i resLo
= _mm_cvtepi8_epi32(src
);
272 __m128i resHi
= _mm_shuffle_epi8(src
,
273 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
275 __m256i result
= _mm256_castsi128_si256(resLo
);
276 result
= _mm256_insertf128_si256(result
, resHi
, 1);
277 return _mm256_castsi256_ps(result
);
278 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
279 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
281 #elif KNOB_SIMD_WIDTH == 16
282 #if ENABLE_AVX512_EMULATION
285 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
.lo
));
287 result
.lo
= _mm256_cvtepu8_epi32(src
);
289 result
.hi
= _mm256_cvtepu8_epi32(_mm_srli_si128(src
, 8));
291 return _simd_castsi_ps(result
);
294 #error Unsupported vector width
298 static simdscalar
pack(simdscalar
&in
)
300 #if KNOB_SIMD_WIDTH == 8
301 simdscalari src
= _simd_castps_si(in
);
302 __m128i res16
= _mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
303 __m128i res8
= _mm_packs_epi16(res16
, _mm_undefined_si128());
304 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
305 #elif KNOB_SIMD_WIDTH == 16
306 #if ENABLE_AVX512_EMULATION
307 simdscalari result
= _simd_setzero_si();
309 __m128i templo
= _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in
.lo
)), _mm256_extractf128_si256(_mm256_castps_si256(in
.lo
), 1));
311 __m128i temphi
= _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in
.hi
)), _mm256_extractf128_si256(_mm256_castps_si256(in
.hi
), 1));
313 __m128i temp
= _mm_packs_epi16(templo
, temphi
);
315 result
.lo
= _mm256_insertf128_si256(result
.lo
, temp
, 0);
317 return _simd_castsi_ps(result
);
320 #error Unsupported vector width
323 #if ENABLE_AVX512_SIMD16
325 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
327 simd16scalar result
= _simd16_setzero_ps();
328 simdscalar resultlo
= _simd_setzero_ps();
330 const __m128 src
= _mm_load_ps(reinterpret_cast<const float *>(pSrc
));
332 resultlo
= _mm256_insertf128_ps(resultlo
, src
, 0);
333 result
= _simd16_insert_ps(result
, resultlo
, 0);
338 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
340 // store simd16 bytes
341 _mm_store_ps(reinterpret_cast<float *>(pDst
), _mm256_castps256_ps128(_simd16_extract_ps(src
, 0)));
344 static simd16scalar
unpack(simd16scalar
&in
)
346 simd16scalari result
= _simd16_setzero_si();
348 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in
, 0)));
350 result
= _simd16_insert_si(result
, _simd_cvtepu8_epi32(src
), 0);
351 result
= _simd16_insert_si(result
, _simd_cvtepu8_epi32(_mm_srli_si128(src
, 8)), 1);
353 return _simd16_castsi_ps(result
);
356 static simd16scalar
pack(simd16scalar
&in
)
358 simd16scalari result
= _simd16_setzero_si();
359 simdscalari resultlo
= _simd_setzero_si();
361 __m128i templo
= _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in
, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 1));
362 __m128i temphi
= _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in
, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 1)), 1));
364 __m128i temp
= _mm_packs_epi16(templo
, temphi
);
366 resultlo
= _mm256_inserti128_si256(resultlo
, temp
, 0);
367 result
= _simd16_insert_si(result
, resultlo
, 0);
369 return _simd16_castsi_ps(result
);
374 //////////////////////////////////////////////////////////////////////////
375 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
376 //////////////////////////////////////////////////////////////////////////
378 struct PackTraits
<16, false>
380 static const uint32_t MyNumBits
= 16;
382 static simdscalar
loadSOA(const uint8_t *pSrc
)
384 #if KNOB_SIMD_WIDTH == 8
385 __m256 result
= _mm256_setzero_ps();
386 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
387 return _mm256_insertf128_ps(result
, vLo
, 0);
388 #elif KNOB_SIMD_WIDTH == 16
389 #if ENABLE_AVX512_EMULATION
392 result
.lo
= _mm256_load_ps(reinterpret_cast<const float*>(pSrc
));
394 result
.hi
= _mm256_undefined_ps();
399 #error Unsupported vector width
403 static void storeSOA(uint8_t *pDst
, simdscalar src
)
405 #if KNOB_SIMD_WIDTH == 8
406 // store 16B (2B * 8)
407 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
408 #elif KNOB_SIMD_WIDTH == 16
409 #if ENABLE_AVX512_EMULATION
410 _mm256_store_ps(reinterpret_cast<float*>(pDst
), src
.lo
);
413 #error Unsupported vector width
417 static simdscalar
unpack(simdscalar
&in
)
419 #if KNOB_SIMD_WIDTH == 8
420 #if KNOB_ARCH==KNOB_ARCH_AVX
421 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
422 __m128i resLo
= _mm_cvtepu16_epi32(src
);
423 __m128i resHi
= _mm_shuffle_epi8(src
,
424 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
426 __m256i result
= _mm256_castsi128_si256(resLo
);
427 result
= _mm256_insertf128_si256(result
, resHi
, 1);
428 return _mm256_castsi256_ps(result
);
429 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
430 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
432 #elif KNOB_SIMD_WIDTH == 16
433 #if ENABLE_AVX512_EMULATION
436 result
.lo
= _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in
.lo
), 0));
438 result
.hi
= _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in
.lo
), 1));
440 return _simd_castsi_ps(result
);
443 #error Unsupported vector width
447 static simdscalar
pack(simdscalar
&in
)
449 #if KNOB_SIMD_WIDTH == 8
450 simdscalari src
= _simd_castps_si(in
);
451 __m256i res
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
452 return _mm256_castsi256_ps(res
);
453 #elif KNOB_SIMD_WIDTH == 16
454 #if ENABLE_AVX512_EMULATION
457 __m256i inlo
= _mm256_castps_si256(in
.lo
);
458 __m256i inhi
= _mm256_castps_si256(in
.hi
);
460 __m256i templo
= _mm256_permute2x128_si256(inlo
, inhi
, 0x20);
461 __m256i temphi
= _mm256_permute2x128_si256(inlo
, inhi
, 0x31);
463 result
.lo
= _mm256_packus_epi32(templo
, temphi
);
464 result
.hi
= _mm256_undefined_si256();
466 return _simd_castsi_ps(result
);
469 #error Unsupported vector width
472 #if ENABLE_AVX512_SIMD16
474 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
476 simd16scalar result
= _simd16_setzero_ps();
478 simdscalar resultlo
= _simd_load_ps(reinterpret_cast<const float *>(pSrc
));
480 result
= _simd16_insert_ps(result
, resultlo
, 0);
485 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
487 _simd_store_ps(reinterpret_cast<float *>(pDst
), _simd16_extract_ps(src
, 0));
490 static simd16scalar
unpack(simd16scalar
&in
)
492 simd16scalari result
= _simd16_setzero_si();
494 result
= _simd16_insert_si(result
, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 0)), 0);
495 result
= _simd16_insert_si(result
, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 1)), 1);
497 return _simd16_castsi_ps(result
);
500 static simd16scalar
pack(simd16scalar
&in
)
502 simd16scalari result
= _simd16_setzero_si();
504 simdscalari inlo
= _simd_castps_si(_simd16_extract_ps(in
, 0));
505 simdscalari inhi
= _simd_castps_si(_simd16_extract_ps(in
, 1));
507 simdscalari templo
= _simd_permute2f128_si(inlo
, inhi
, 0x20);
508 simdscalari temphi
= _simd_permute2f128_si(inlo
, inhi
, 0x31);
510 result
= _simd16_insert_si(result
, _simd_packus_epi32(templo
, temphi
), 0);
512 return _simd16_castsi_ps(result
);
517 //////////////////////////////////////////////////////////////////////////
518 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
519 //////////////////////////////////////////////////////////////////////////
521 struct PackTraits
<16, true>
523 static const uint32_t MyNumBits
= 16;
525 static simdscalar
loadSOA(const uint8_t *pSrc
)
527 #if KNOB_SIMD_WIDTH == 8
528 __m256 result
= _mm256_setzero_ps();
529 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
530 return _mm256_insertf128_ps(result
, vLo
, 0);
531 #elif KNOB_SIMD_WIDTH == 16
532 #if ENABLE_AVX512_EMULATION
535 result
.lo
= _mm256_load_ps(reinterpret_cast<const float*>(pSrc
));
537 result
.hi
= _mm256_undefined_ps();
542 #error Unsupported vector width
546 static void storeSOA(uint8_t *pDst
, simdscalar src
)
548 #if KNOB_SIMD_WIDTH == 8
549 // store 16B (2B * 8)
550 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
551 #elif KNOB_SIMD_WIDTH == 16
552 #if ENABLE_AVX512_EMULATION
553 _mm256_store_ps(reinterpret_cast<float*>(pDst
), src
.lo
);
556 #error Unsupported vector width
560 static simdscalar
unpack(simdscalar
&in
)
562 #if KNOB_SIMD_WIDTH == 8
563 #if KNOB_ARCH==KNOB_ARCH_AVX
564 SWR_ASSERT(0); // I think this is incorrectly implemented
565 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
566 __m128i resLo
= _mm_cvtepi16_epi32(src
);
567 __m128i resHi
= _mm_shuffle_epi8(src
,
568 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
570 __m256i result
= _mm256_castsi128_si256(resLo
);
571 result
= _mm256_insertf128_si256(result
, resHi
, 1);
572 return _mm256_castsi256_ps(result
);
573 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
574 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
576 #elif KNOB_SIMD_WIDTH == 16
577 #if ENABLE_AVX512_EMULATION
580 result
.lo
= _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in
.lo
), 0));
582 result
.hi
= _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in
.lo
), 1));
584 return _simd_castsi_ps(result
);
587 #error Unsupported vector width
591 static simdscalar
pack(simdscalar
&in
)
593 #if KNOB_SIMD_WIDTH == 8
594 simdscalari src
= _simd_castps_si(in
);
595 __m256i res
= _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
596 return _mm256_castsi256_ps(res
);
597 #elif KNOB_SIMD_WIDTH == 16
598 #if ENABLE_AVX512_EMULATION
601 __m256i inlo
= _mm256_castps_si256(in
.lo
);
602 __m256i inhi
= _mm256_castps_si256(in
.hi
);
604 __m256i templo
= _mm256_permute2x128_si256(inlo
, inhi
, 0x20);
605 __m256i temphi
= _mm256_permute2x128_si256(inlo
, inhi
, 0x31);
607 result
.lo
= _mm256_packs_epi32(templo
, temphi
);
608 result
.hi
= _mm256_undefined_si256();
610 return _simd_castsi_ps(result
);
613 #error Unsupported vector width
616 #if ENABLE_AVX512_SIMD16
618 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
620 simd16scalar result
= _simd16_setzero_ps();
622 simdscalar resultlo
= _simd_load_ps(reinterpret_cast<const float *>(pSrc
));
624 result
= _simd16_insert_ps(result
, resultlo
, 0);
629 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
631 _simd_store_ps(reinterpret_cast<float *>(pDst
), _simd16_extract_ps(src
, 0));
634 static simd16scalar
unpack(simd16scalar
&in
)
636 simd16scalari result
= _simd16_setzero_si();
638 result
= _simd16_insert_si(result
, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 0)), 0);
639 result
= _simd16_insert_si(result
, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 1)), 1);
641 return _simd16_castsi_ps(result
);
644 static simd16scalar
pack(simd16scalar
&in
)
646 simd16scalari result
= _simd16_setzero_si();
648 simdscalari inlo
= _simd_castps_si(_simd16_extract_ps(in
, 0));
649 simdscalari inhi
= _simd_castps_si(_simd16_extract_ps(in
, 1));
651 simdscalari templo
= _simd_permute2f128_si(inlo
, inhi
, 0x20);
652 simdscalari temphi
= _simd_permute2f128_si(inlo
, inhi
, 0x31);
654 result
= _simd16_insert_si(result
, _simd_packus_epi32(templo
, temphi
), 0);
656 return _simd16_castsi_ps(result
);
661 //////////////////////////////////////////////////////////////////////////
662 /// PackTraits - Helpers for packing / unpacking 32 bit channels
663 //////////////////////////////////////////////////////////////////////////
665 struct PackTraits
<32, false>
667 static const uint32_t MyNumBits
= 32;
669 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_load_ps((const float*)pSrc
); }
670 static void storeSOA(uint8_t *pDst
, simdscalar src
) { _simd_store_ps((float*)pDst
, src
); }
671 static simdscalar
unpack(simdscalar
&in
) { return in
; }
672 static simdscalar
pack(simdscalar
&in
) { return in
; }
673 #if ENABLE_AVX512_SIMD16
675 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
677 return _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
680 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
682 _simd16_store_ps(reinterpret_cast<float *>(pDst
), src
);
685 static simd16scalar
unpack(simd16scalar
&in
)
690 static simd16scalar
pack(simd16scalar
&in
)
697 //////////////////////////////////////////////////////////////////////////
698 /// TypeTraits - Format type traits.
699 //////////////////////////////////////////////////////////////////////////
700 template<SWR_TYPE type
, uint32_t NumBits
>
701 struct TypeTraits
: PackTraits
<NumBits
>
703 static const SWR_TYPE MyType
= type
;
704 static float toFloat() { return 0.0; }
705 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
706 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
709 //////////////////////////////////////////////////////////////////////////
710 /// TypeTraits - Format type traits specialization for UINT8
711 //////////////////////////////////////////////////////////////////////////
712 template<> struct TypeTraits
<SWR_TYPE_UINT
, 8> : PackTraits
<8>
714 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
715 static float toFloat() { return 0.0; }
716 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
717 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
720 //////////////////////////////////////////////////////////////////////////
721 /// TypeTraits - Format type traits specialization for SINT8
722 //////////////////////////////////////////////////////////////////////////
723 template<> struct TypeTraits
<SWR_TYPE_SINT
, 8> : PackTraits
<8, true>
725 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
726 static float toFloat() { return 0.0; }
727 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
728 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
731 //////////////////////////////////////////////////////////////////////////
732 /// TypeTraits - Format type traits specialization for UINT16
733 //////////////////////////////////////////////////////////////////////////
734 template<> struct TypeTraits
<SWR_TYPE_UINT
, 16> : PackTraits
<16>
736 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
737 static float toFloat() { return 0.0; }
738 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
739 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
742 //////////////////////////////////////////////////////////////////////////
743 /// TypeTraits - Format type traits specialization for SINT16
744 //////////////////////////////////////////////////////////////////////////
745 template<> struct TypeTraits
<SWR_TYPE_SINT
, 16> : PackTraits
<16, true>
747 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
748 static float toFloat() { return 0.0; }
749 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
750 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
753 //////////////////////////////////////////////////////////////////////////
754 /// TypeTraits - Format type traits specialization for UINT32
755 //////////////////////////////////////////////////////////////////////////
756 template<> struct TypeTraits
<SWR_TYPE_UINT
, 32> : PackTraits
<32>
758 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
759 static float toFloat() { return 0.0; }
760 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
761 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
764 //////////////////////////////////////////////////////////////////////////
765 /// TypeTraits - Format type traits specialization for SINT32
766 //////////////////////////////////////////////////////////////////////////
767 template<> struct TypeTraits
<SWR_TYPE_SINT
, 32> : PackTraits
<32>
769 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
770 static float toFloat() { return 0.0; }
771 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
772 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
775 //////////////////////////////////////////////////////////////////////////
776 /// TypeTraits - Format type traits specialization for UNORM5
777 //////////////////////////////////////////////////////////////////////////
778 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 5> : PackTraits
<5>
780 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
781 static float toFloat() { return 1.0f
/ 31.0f
; }
782 static float fromFloat() { return 31.0f
; }
783 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
786 //////////////////////////////////////////////////////////////////////////
787 /// TypeTraits - Format type traits specialization for UNORM6
788 //////////////////////////////////////////////////////////////////////////
789 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 6> : PackTraits
<6>
791 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
792 static float toFloat() { return 1.0f
/ 63.0f
; }
793 static float fromFloat() { return 63.0f
; }
794 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
797 //////////////////////////////////////////////////////////////////////////
798 /// TypeTraits - Format type traits specialization for UNORM8
799 //////////////////////////////////////////////////////////////////////////
800 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 8> : PackTraits
<8>
802 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
803 static float toFloat() { return 1.0f
/ 255.0f
; }
804 static float fromFloat() { return 255.0f
; }
805 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
808 //////////////////////////////////////////////////////////////////////////
809 /// TypeTraits - Format type traits specialization for SNORM8
810 //////////////////////////////////////////////////////////////////////////
811 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 8> : PackTraits
<8, true>
813 static const SWR_TYPE MyType
= SWR_TYPE_SNORM
;
814 static float toFloat() { return 1.0f
/ 127.0f
; }
815 static float fromFloat() { return 127.0f
; }
816 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
819 //////////////////////////////////////////////////////////////////////////
820 /// TypeTraits - Format type traits specialization for UNORM16
821 //////////////////////////////////////////////////////////////////////////
822 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 16> : PackTraits
<16>
824 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
825 static float toFloat() { return 1.0f
/ 65535.0f
; }
826 static float fromFloat() { return 65535.0f
; }
827 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
830 //////////////////////////////////////////////////////////////////////////
831 /// TypeTraits - Format type traits specialization for SNORM16
832 //////////////////////////////////////////////////////////////////////////
833 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 16> : PackTraits
<16, true>
835 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
836 static float toFloat() { return 1.0f
/ 32767.0f
; }
837 static float fromFloat() { return 32767.0f
; }
838 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
841 //////////////////////////////////////////////////////////////////////////
842 /// TypeTraits - Format type traits specialization for UNORM24
843 //////////////////////////////////////////////////////////////////////////
845 struct TypeTraits
< SWR_TYPE_UNORM
, 24 > : PackTraits
<32>
847 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
848 static float toFloat() { return 1.0f
/ 16777215.0f
; }
849 static float fromFloat() { return 16777215.0f
; }
850 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
853 //////////////////////////////////////////////////////////////////////////
854 // FLOAT Specializations from here on...
855 //////////////////////////////////////////////////////////////////////////
856 #define TO_M128i(a) _mm_castps_si128(a)
857 #define TO_M128(a) _mm_castsi128_ps(a)
861 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
862 inline static __m128
fastpow(__m128 arg
) {
865 static const __m128 factor
= _mm_set1_ps(exp2(127.0f
* expden
/ expnum
- 127.0f
)
866 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
));
868 // Apply a constant pre-correction factor.
869 ret
= _mm_mul_ps(ret
, factor
);
871 // Reinterpret arg as integer to obtain logarithm.
872 //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
873 ret
= _mm_cvtepi32_ps(_mm_castps_si128(ret
));
875 // Multiply logarithm by power.
876 ret
= _mm_mul_ps(ret
, _mm_set1_ps(1.0f
* expnum
/ expden
));
878 // Convert back to "integer" to exponentiate.
879 //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
880 ret
= _mm_castsi128_ps(_mm_cvtps_epi32(ret
));
885 inline static __m128
pow512_4(__m128 arg
) {
886 // 5/12 is too small, so compute the 4th root of 20/12 instead.
887 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
888 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
889 __m128 xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
890 __m128 xover
= _mm_mul_ps(arg
, xf
);
892 __m128 xfm1
= _mm_rsqrt_ps(xf
);
893 __m128 x2
= _mm_mul_ps(arg
, arg
);
894 __m128 xunder
= _mm_mul_ps(x2
, xfm1
);
896 // sqrt2 * over + 2 * sqrt2 * under
897 __m128 xavg
= _mm_mul_ps(_mm_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
),
898 _mm_add_ps(xover
, xunder
));
900 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
901 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
905 inline static __m128
powf_wrapper(__m128 Base
, float Exp
)
907 float *f
= (float *)(&Base
);
909 return _mm_set_ps(powf(f
[3], Exp
),
915 static inline __m128
ConvertFloatToSRGB2(__m128
& Src
)
917 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
918 __m128i CmpToSRGBThresholdMask
= TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f
), Src
));
920 // squeeze the mask down to 16 bits (4 bits per DWORD)
921 int CompareResult
= _mm_movemask_epi8(CmpToSRGBThresholdMask
);
926 if (CompareResult
== 0xFFFF)
928 // all DWORDs are <= the threshold
929 Result
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
931 else if (CompareResult
== 0x0)
933 // all DWORDs are > the threshold
934 __m128 fSrc_0RGB
= Src
;
936 // --> 1.055f * c(1.0f/2.4f) - 0.055f
937 #if KNOB_USE_FAST_SRGB == TRUE
938 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
939 __m128 f
= pow512_4(fSrc_0RGB
);
941 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
943 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
944 Result
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
948 // some DWORDs are <= the threshold and some are > threshold
949 __m128 Src_0RGB_mul_denorm
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
951 __m128 fSrc_0RGB
= Src
;
953 // --> 1.055f * c(1.0f/2.4f) - 0.055f
954 #if KNOB_USE_FAST_SRGB == TRUE
955 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
956 __m128 f
= pow512_4(fSrc_0RGB
);
958 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
960 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
961 f
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
963 // Clear the alpha (is garbage after the sub)
964 __m128i i
= _mm_and_si128(TO_M128i(f
), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
966 __m128i LessThanPart
= _mm_and_si128(CmpToSRGBThresholdMask
, TO_M128i(Src_0RGB_mul_denorm
));
967 __m128i GreaterEqualPart
= _mm_andnot_si128(CmpToSRGBThresholdMask
, i
);
968 __m128i CombinedParts
= _mm_or_si128(LessThanPart
, GreaterEqualPart
);
970 Result
= TO_M128(CombinedParts
);
976 #if ENABLE_AVX512_SIMD16
977 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
978 inline static simd16scalar
fastpow(simd16scalar value
)
980 static const float factor1
= exp2(127.0f
* expden
/ expnum
- 127.0f
)
981 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
);
983 // Apply a constant pre-correction factor.
984 simd16scalar result
= _simd16_mul_ps(value
, _simd16_set1_ps(factor1
));
986 // Reinterpret arg as integer to obtain logarithm.
987 //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
988 result
= _simd16_cvtepi32_ps(_simd16_castps_si(result
));
990 // Multiply logarithm by power.
991 result
= _simd16_mul_ps(result
, _simd16_set1_ps(1.0f
* expnum
/ expden
));
993 // Convert back to "integer" to exponentiate.
994 //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
995 result
= _simd16_castsi_ps(_simd16_cvtps_epi32(result
));
1000 inline static simd16scalar
pow512_4(simd16scalar arg
)
1002 // 5/12 is too small, so compute the 4th root of 20/12 instead.
1003 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
1004 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
1005 simd16scalar xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
1006 simd16scalar xover
= _simd16_mul_ps(arg
, xf
);
1008 simd16scalar xfm1
= _simd16_rsqrt_ps(xf
);
1009 simd16scalar x2
= _simd16_mul_ps(arg
, arg
);
1010 simd16scalar xunder
= _simd16_mul_ps(x2
, xfm1
);
1012 // sqrt2 * over + 2 * sqrt2 * under
1013 simd16scalar xavg
= _simd16_mul_ps(_simd16_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
), _simd16_add_ps(xover
, xunder
));
1015 xavg
= _simd16_mul_ps(xavg
, _simd16_rsqrt_ps(xavg
));
1016 xavg
= _simd16_mul_ps(xavg
, _simd16_rsqrt_ps(xavg
));
1021 inline static simd16scalar
powf_wrapper(const simd16scalar base
, float exp
)
1023 const float *f
= reinterpret_cast<const float *>(&base
);
1025 return _simd16_set_ps(
1045 // float to SRGB conversion formula
1047 // if (value < 0.0031308f)
1050 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
1052 static inline simd16scalar
ConvertFloatToSRGB2(const simd16scalar
&value
)
1054 // create a mask where the source is < the minimal SRGB float value
1055 const simd16mask mask
= _simd16_cmplt_ps_mask(value
, _simd16_set1_ps(0.0031308f
));
1057 // if all elements are < the threshold, result = value * 12.92
1058 simd16scalar result
= _simd16_mul_ps(value
, _simd16_set1_ps(12.92f
));
1060 if (_simd16_mask2int(mask
) != 0xFFFF)
1062 // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
1063 #if KNOB_USE_FAST_SRGB == TRUE
1064 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
1065 simd16scalar result2
= pow512_4(value
);
1067 simd16scalar result2
= powf_wrapper(value
, 1.0f
/ 2.4f
);
1070 result2
= _simd16_mul_ps(result2
, _simd16_set1_ps(1.055f
));
1071 result2
= _simd16_sub_ps(result2
, _simd16_set1_ps(0.055f
));
1073 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
1074 // only native AVX512 can directly use the computed mask for the blend operation
1075 result
= _mm512_mask_blend_ps(mask
, result2
, result
);
1077 result
= _simd16_blendv_ps(result2
, result
, _simd16_cmplt_ps(value
, _simd16_set1_ps(0.0031308f
)));
1085 //////////////////////////////////////////////////////////////////////////
1086 /// TypeTraits - Format type traits specialization for FLOAT16
1087 //////////////////////////////////////////////////////////////////////////
// Format type traits for 16-bit float components; derives from PackTraits<16>.
// NOTE(review): this listing carries fused original line numbers and omits
// some lines (braces, some preprocessor lines); the comments below describe
// only the statements that are visible.
1088 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 16> : PackTraits
<16>
1090 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
// Both scale factors are 1.0f for this type.
1091 static float toFloat() { return 1.0f
; }
1092 static float fromFloat() { return 1.0f
; }
// sRGB conversion is not provided for this type: asserts, then returns a zero vector.
1093 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
// pack: converts the float32 lanes of 'in' to float16 values.
// The KNOB_SIMD_WIDTH == 8 / KNOB_ARCH_AVX path below performs the conversion
// with integer operations; the other visible paths call _mm256_cvtps_ph with
// _MM_FROUND_TRUNC.
1095 static simdscalar
pack(const simdscalar
&in
)
1097 #if KNOB_SIMD_WIDTH == 8
1098 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1099 // input is 8 packed float32, output is 8 packed float16
1100 simdscalari src
= _simd_castps_si(in
);
// float32 field widths and masks
1102 static const uint32_t FLOAT_EXP_BITS
= 8;
1103 static const uint32_t FLOAT_MANTISSA_BITS
= 23;
1104 static const uint32_t FLOAT_MANTISSA_MASK
= (1U << FLOAT_MANTISSA_BITS
) - 1;
1105 static const uint32_t FLOAT_EXP_MASK
= ((1U << FLOAT_EXP_BITS
) - 1) << FLOAT_MANTISSA_BITS
;
// float16 field widths and exponent mask
1107 static const uint32_t HALF_EXP_BITS
= 5;
1108 static const uint32_t HALF_MANTISSA_BITS
= 10;
1109 static const uint32_t HALF_EXP_MASK
= ((1U << HALF_EXP_BITS
) - 1) << HALF_MANTISSA_BITS
;
1111 // minimum exponent required, exponents below this are flushed to 0.
1112 static const int32_t HALF_EXP_MIN
= -14;
1113 static const int32_t FLOAT_EXP_BIAS
= 127;
1114 static const int32_t FLOAT_EXP_MIN
= HALF_EXP_MIN
+ FLOAT_EXP_BIAS
;
1115 static const int32_t FLOAT_EXP_MIN_FTZ
= FLOAT_EXP_MIN
- (HALF_MANTISSA_BITS
+ 1); // +1 for the lack of implicit significand
1117 // maximum exponent required, exponents above this are set to infinity
1118 static const int32_t HALF_EXP_MAX
= 15;
1119 static const int32_t FLOAT_EXP_MAX
= HALF_EXP_MAX
+ FLOAT_EXP_BIAS
;
// Broadcast field masks and exponent thresholds; the thresholds are shifted
// into the float32 exponent field position so they compare against vExp directly.
1121 const simdscalari vSignMask
= _simd_set1_epi32(0x80000000);
1122 const simdscalari vExpMask
= _simd_set1_epi32(FLOAT_EXP_MASK
);
1123 const simdscalari vManMask
= _simd_set1_epi32(FLOAT_MANTISSA_MASK
);
1124 const simdscalari vExpMin
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN
<< FLOAT_MANTISSA_BITS
));
1125 const simdscalari vExpMinFtz
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN_FTZ
<< FLOAT_MANTISSA_BITS
));
1126 const simdscalari vExpMax
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MAX
<< FLOAT_MANTISSA_BITS
));
// Split every lane into its sign, exponent and mantissa fields.
1128 simdscalari vSign
= _simd_and_si(src
, vSignMask
);
1129 simdscalari vExp
= _simd_and_si(src
, vExpMask
);
1130 simdscalari vMan
= _simd_and_si(src
, vManMask
);
// Lane classification:
//   vFTZMask    - exponent below the flush-to-zero threshold
//   vDenormMask - exponent below vExpMin but not selected by vFTZMask
//   vInfMask    - exponent field equal to the full exponent mask
//   vClampMask  - exponent above vExpMax, excluding vInfMask lanes
1132 simdscalari vFTZMask
= _simd_cmplt_epi32(vExp
, vExpMinFtz
);
1133 simdscalari vDenormMask
= _simd_andnot_si(vFTZMask
, _simd_cmplt_epi32(vExp
, vExpMin
));
1134 simdscalari vInfMask
= _simd_cmpeq_epi32(vExpMask
, vExp
);
1135 simdscalari vClampMask
= _simd_andnot_si(vInfMask
, _simd_cmplt_epi32(vExpMax
, vExp
));
// Rebase the exponent: subtract vExpMin, then add one exponent step.
1137 simdscalari vHalfExp
= _simd_add_epi32(_simd_sub_epi32(vExp
, vExpMin
), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS
));
1139 // pack output 16-bits into the lower 16-bits of each 32-bit channel
1140 simdscalari vDst
= _simd_and_si(_simd_srli_epi32(vHalfExp
, 13), _simd_set1_epi32(HALF_EXP_MASK
));
1141 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vMan
, FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
// Zero the lanes selected by vFTZMask.
1144 vDst
= _simd_andnot_si(vFTZMask
, vDst
);
1145 // Apply Infinites / NaN
1146 vDst
= _simd_or_si(vDst
, _simd_and_si(vInfMask
, _simd_set1_epi32(HALF_EXP_MASK
)));
// Replace the lanes selected by vClampMask with 0x7BFF.
1149 vDst
= _simd_andnot_si(vClampMask
, vDst
);
1150 vDst
= _simd_or_si(vDst
,
1151 _simd_and_si(vClampMask
, _simd_set1_epi32(0x7BFF)));
1153 // Compute Denormals (subnormals)
// Entered only when at least one vDenormMask bit is set; the vectors are then
// accessed lane by lane through uint32_t pointer views.
1154 if (!_mm256_testz_si256(vDenormMask
, vDenormMask
))
1156 uint32_t *pDenormMask
= (uint32_t*)&vDenormMask
;
1157 uint32_t *pExp
= (uint32_t*)&vExp
;
1158 uint32_t *pMan
= (uint32_t*)&vMan
;
1159 uint32_t *pDst
= (uint32_t*)&vDst
;
1160 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
1164 // Need to compute subnormal value
1165 uint32_t exponent
= pExp
[i
] >> FLOAT_MANTISSA_BITS
;
1166 uint32_t mantissa
= pMan
[i
] |
1167 (1U << FLOAT_MANTISSA_BITS
); // Denorms include no "implicit" 1s. Make it explicit
// Shift right by the exponent shortfall plus the mantissa width difference.
1169 pDst
[i
] = mantissa
>> ((FLOAT_EXP_MIN
- exponent
) + (FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
// Move the sign bit from bit 31 down to bit 15.
1175 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vSign
, 16));
1177 // Pack to lower 128-bits
1178 vDst
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst
), _mm256_extractf128_si256(vDst
, 1)));
// Check that compares the first four 32-bit lanes of vDst with _mm256_cvtps_ph.
// NOTE(review): any enclosing preprocessor lines are not visible in this
// listing - confirm whether this check is actually compiled.
1181 #if !defined(NDEBUG)
1182 simdscalari vCheck
= _mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
));
1184 for (uint32_t i
= 0; i
< 4; ++i
)
1186 SWR_ASSERT(vCheck
.m256i_i32
[i
] == vDst
.m256i_i32
[i
]);
1191 return _simd_castsi_ps(vDst
);
// Width-8 path that uses the _mm256_cvtps_ph conversion intrinsic directly.
1194 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
)));
1196 #elif KNOB_SIMD_WIDTH == 16
1197 #if ENABLE_AVX512_EMULATION
// Emulated 16-wide path: convert each half and place both 128-bit results in
// result.lo; result.hi is left undefined.
1200 __m128i templo
= _mm256_cvtps_ph(in
.lo
, _MM_FROUND_TRUNC
);
1201 __m128i temphi
= _mm256_cvtps_ph(in
.hi
, _MM_FROUND_TRUNC
);
1203 result
.lo
= _mm256_castsi128_si256(templo
);
1204 result
.lo
= _mm256_insertf128_si256(result
.lo
, temphi
, 1);
1206 result
.hi
= _mm256_undefined_si256();
1208 return _simd_castsi_ps(result
);
1211 #error Unsupported vector width
// unpack: not implemented; asserts and returns a zero vector.
1215 static simdscalar
unpack(const simdscalar
&in
)
1217 // input is 8 packed float16, output is 8 packed float32
1218 SWR_ASSERT(0); // @todo
1219 return _simd_setzero_ps();
1221 #if ENABLE_AVX512_SIMD16
// 16-wide pack: converts the two 8-lane halves of 'in' separately and inserts
// the two 128-bit results into position 0 of a zero-initialized result.
1223 static simd16scalar
pack(const simd16scalar
&in
)
1225 simd16scalari result
= _simd16_setzero_si();
1226 simdscalari resultlo
= _simd_setzero_si();
1228 #if (KNOB_ARCH == KNOB_ARCH_AVX)
// KNOB_ARCH_AVX: reuse the 8-wide pack() and take the low 128 bits of each result.
1229 simdscalar simdlo
= pack(_simd16_extract_ps(in
, 0));
1230 simdscalar simdhi
= pack(_simd16_extract_ps(in
, 1));
1232 __m128i templo
= _mm256_extractf128_si256(_simd_castps_si(simdlo
), 0);
1233 __m128i temphi
= _mm256_extractf128_si256(_simd_castps_si(simdhi
), 0);
// Alternative path: convert each half with _mm256_cvtps_ph.
1236 __m128i templo
= _mm256_cvtps_ph(_simd16_extract_ps(in
, 0), _MM_FROUND_TRUNC
);
1237 __m128i temphi
= _mm256_cvtps_ph(_simd16_extract_ps(in
, 1), _MM_FROUND_TRUNC
);
1240 resultlo
= _mm256_insertf128_si256(resultlo
, templo
, 0);
1241 resultlo
= _mm256_insertf128_si256(resultlo
, temphi
, 1);
1243 result
= _simd16_insert_si(result
, resultlo
, 0);
1245 return _simd16_castsi_ps(result
);
// 16-wide unpack: not implemented; asserts and returns a zero vector.
1248 static simd16scalar
unpack(const simd16scalar
&in
)
1250 // input is 16 packed float16, output is 16 packed float32
1251 SWR_ASSERT(0); // @todo
1252 return _simd16_setzero_ps();
1257 //////////////////////////////////////////////////////////////////////////
1258 /// TypeTraits - Format type traits specialization for FLOAT32
1259 //////////////////////////////////////////////////////////////////////////
1260 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 32> : PackTraits
<32>
1262 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
1263 static float toFloat() { return 1.0f
; }
1264 static float fromFloat() { return 1.0f
; }
1265 static inline simdscalar
convertSrgb(simdscalar
&in
)
1267 #if KNOB_SIMD_WIDTH == 8
1268 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
1269 __m128 srcLo
= _mm256_extractf128_ps(in
, 0);
1270 __m128 srcHi
= _mm256_extractf128_ps(in
, 1);
1272 srcLo
= ConvertFloatToSRGB2(srcLo
);
1273 srcHi
= ConvertFloatToSRGB2(srcHi
);
1275 in
= _mm256_insertf128_ps(in
, srcLo
, 0);
1276 in
= _mm256_insertf128_ps(in
, srcHi
, 1);
1278 #elif KNOB_SIMD_WIDTH == 16
1279 #if ENABLE_AVX512_EMULATION
1280 __m128 inlo0
= _mm256_extractf128_ps(in
.lo
, 0);
1281 __m128 inlo1
= _mm256_extractf128_ps(in
.lo
, 1);
1282 __m128 inhi0
= _mm256_extractf128_ps(in
.hi
, 0);
1283 __m128 inhi1
= _mm256_extractf128_ps(in
.hi
, 1);
1285 inlo0
= ConvertFloatToSRGB2(inlo0
);
1286 inlo1
= ConvertFloatToSRGB2(inlo1
);
1287 inhi0
= ConvertFloatToSRGB2(inhi0
);
1288 inhi1
= ConvertFloatToSRGB2(inhi1
);
1290 in
.lo
= _mm256_insertf128_ps(in
.lo
, inlo0
, 0);
1291 in
.lo
= _mm256_insertf128_ps(in
.lo
, inlo1
, 1);
1292 in
.hi
= _mm256_insertf128_ps(in
.hi
, inhi0
, 0);
1293 in
.hi
= _mm256_insertf128_ps(in
.hi
, inhi1
, 1);
1296 #error Unsupported vector width
1300 #if ENABLE_AVX512_SIMD16
1302 static inline simd16scalar
convertSrgb(simd16scalar
&in
)
1304 return ConvertFloatToSRGB2(in
);
1309 //////////////////////////////////////////////////////////////////////////
1310 /// Format1 - Bitfield for single component formats.
1311 //////////////////////////////////////////////////////////////////////////
1312 template<uint32_t x
>
1319 ///@ The following are here to provide full template needed in Formats.
1326 //////////////////////////////////////////////////////////////////////////
1327 /// Format1 - Bitfield for single component formats - 8 bit specialization
1328 //////////////////////////////////////////////////////////////////////////
1336 ///@ The following are here to provide full template needed in Formats.
1343 //////////////////////////////////////////////////////////////////////////
1344 /// Format1 - Bitfield for single component formats - 16 bit specialization
1345 //////////////////////////////////////////////////////////////////////////
1353 ///@ The following are here to provide full template needed in Formats.
1360 //////////////////////////////////////////////////////////////////////////
1361 /// Format2 - Bitfield for 2 component formats.
1362 //////////////////////////////////////////////////////////////////////////
1363 template<uint32_t x
, uint32_t y
>
1373 ///@ The following are here to provide full template needed in Formats.
1379 //////////////////////////////////////////////////////////////////////////
1380 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
1381 //////////////////////////////////////////////////////////////////////////
1392 ///@ The following are here to provide full template needed in Formats.
1398 //////////////////////////////////////////////////////////////////////////
1399 /// Format3 - Bitfield for 3 component formats.
1400 //////////////////////////////////////////////////////////////////////////
1401 template<uint32_t x
, uint32_t y
, uint32_t z
>
1410 uint32_t a
; ///@note This is here to provide full template needed in Formats.
1413 //////////////////////////////////////////////////////////////////////////
1414 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
1415 //////////////////////////////////////////////////////////////////////////
1417 union Format3
<5,6,5>
1425 uint16_t a
; ///@note This is here to provide full template needed in Formats.
1428 //////////////////////////////////////////////////////////////////////////
1429 /// Format4 - Bitfield for 4 component formats.
1430 //////////////////////////////////////////////////////////////////////////
1431 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
1440 //////////////////////////////////////////////////////////////////////////
1441 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1442 //////////////////////////////////////////////////////////////////////////
1444 struct Format4
<5,5,5,1>
1452 //////////////////////////////////////////////////////////////////////////
1453 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1454 //////////////////////////////////////////////////////////////////////////
1456 struct Format4
<4,4,4,4>
1464 //////////////////////////////////////////////////////////////////////////
1465 /// ComponentTraits - Default components
1466 //////////////////////////////////////////////////////////////////////////
1467 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
1470 INLINE
static uint32_t GetDefault(uint32_t comp
)
1472 static const uint32_t defaults
[4]{ x
, y
, z
, w
};
1473 return defaults
[comp
];
1477 //////////////////////////////////////////////////////////////////////////
1478 /// ComponentTraits - Component type traits.
1479 //////////////////////////////////////////////////////////////////////////
1480 template<SWR_TYPE X
, uint32_t NumBitsX
, SWR_TYPE Y
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsY
= 0, SWR_TYPE Z
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsZ
= 0, SWR_TYPE W
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsW
= 0>
1481 struct ComponentTraits
1483 INLINE
static SWR_TYPE
GetType(uint32_t comp
)
1485 static const SWR_TYPE CompType
[4]{ X
, Y
, Z
, W
};
1486 return CompType
[comp
];
1489 INLINE
static uint32_t GetBPC(uint32_t comp
)
1491 static const uint32_t MyBpc
[4]{ NumBitsX
, NumBitsY
, NumBitsZ
, NumBitsW
};
1495 INLINE
static bool isNormalized(uint32_t comp
)
1500 return (X
== SWR_TYPE_UNORM
|| X
== SWR_TYPE_SNORM
) ? true : false;
1502 return (Y
== SWR_TYPE_UNORM
|| Y
== SWR_TYPE_SNORM
) ? true : false;
1504 return (Z
== SWR_TYPE_UNORM
|| Z
== SWR_TYPE_SNORM
) ? true : false;
1506 return (W
== SWR_TYPE_UNORM
|| W
== SWR_TYPE_SNORM
) ? true : false;
1512 INLINE
static float toFloat(uint32_t comp
)
1517 return TypeTraits
<X
, NumBitsX
>::toFloat();
1519 return TypeTraits
<Y
, NumBitsY
>::toFloat();
1521 return TypeTraits
<Z
, NumBitsZ
>::toFloat();
1523 return TypeTraits
<W
, NumBitsW
>::toFloat();
1526 return TypeTraits
<X
, NumBitsX
>::toFloat();
1530 INLINE
static float fromFloat(uint32_t comp
)
1535 return TypeTraits
<X
, NumBitsX
>::fromFloat();
1537 return TypeTraits
<Y
, NumBitsY
>::fromFloat();
1539 return TypeTraits
<Z
, NumBitsZ
>::fromFloat();
1541 return TypeTraits
<W
, NumBitsW
>::fromFloat();
1544 return TypeTraits
<X
, NumBitsX
>::fromFloat();
1547 INLINE
static simdscalar
loadSOA(uint32_t comp
, const uint8_t* pSrc
)
1552 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1554 return TypeTraits
<Y
, NumBitsY
>::loadSOA(pSrc
);
1556 return TypeTraits
<Z
, NumBitsZ
>::loadSOA(pSrc
);
1558 return TypeTraits
<W
, NumBitsW
>::loadSOA(pSrc
);
1561 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1564 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simdscalar src
)
1569 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1572 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1575 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1578 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1582 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1585 INLINE
static simdscalar
unpack(uint32_t comp
, simdscalar
&in
)
1590 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1592 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1594 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1596 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1599 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1602 INLINE
static simdscalar
pack(uint32_t comp
, simdscalar
&in
)
1607 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1609 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1611 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1613 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1616 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1619 INLINE
static simdscalar
convertSrgb(uint32_t comp
, simdscalar
&in
)
1624 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1626 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1628 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1630 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1633 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1635 #if ENABLE_AVX512_SIMD16
1637 INLINE
static simd16scalar
loadSOA_16(uint32_t comp
, const uint8_t* pSrc
)
1642 return TypeTraits
<X
, NumBitsX
>::loadSOA_16(pSrc
);
1644 return TypeTraits
<Y
, NumBitsY
>::loadSOA_16(pSrc
);
1646 return TypeTraits
<Z
, NumBitsZ
>::loadSOA_16(pSrc
);
1648 return TypeTraits
<W
, NumBitsW
>::loadSOA_16(pSrc
);
1651 return TypeTraits
<X
, NumBitsX
>::loadSOA_16(pSrc
);
1654 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simd16scalar src
)
1659 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1662 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1665 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1668 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1672 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1675 INLINE
static simd16scalar
unpack(uint32_t comp
, simd16scalar
&in
)
1680 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1682 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1684 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1686 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1689 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1692 INLINE
static simd16scalar
pack(uint32_t comp
, simd16scalar
&in
)
1697 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1699 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1701 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1703 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1706 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1709 INLINE
static simd16scalar
convertSrgb(uint32_t comp
, simd16scalar
&in
)
1714 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1716 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1718 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1720 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1723 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);