1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for SWR_FORMAT functions.
27 ******************************************************************************/
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
34 //////////////////////////////////////////////////////////////////////////
35 template <uint32_t NumBits
, bool Signed
= false>
38 static const uint32_t MyNumBits
= NumBits
;
39 static simdscalar
loadSOA(const uint8_t *pSrc
) = delete;
40 static void storeSOA(uint8_t *pDst
, simdscalar src
) = delete;
41 static simdscalar
unpack(simdscalar
&in
) = delete;
42 static simdscalar
pack(simdscalar
&in
) = delete;
43 #if ENABLE_AVX512_SIMD16
44 static simd16scalar
loadSOA_16(const uint8_t *pSrc
) = delete;
45 static void storeSOA(uint8_t *pDst
, simd16scalar src
) = delete;
46 static simd16scalar
unpack(simd16scalar
&in
) = delete;
47 static simd16scalar
pack(simd16scalar
&in
) = delete;
51 //////////////////////////////////////////////////////////////////////////
52 /// PackTraits - Helpers for packing / unpacking unused channels
53 //////////////////////////////////////////////////////////////////////////
55 struct PackTraits
<0, false>
57 static const uint32_t MyNumBits
= 0;
59 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_setzero_ps(); }
60 static void storeSOA(uint8_t *pDst
, simdscalar src
) { return; }
61 static simdscalar
unpack(simdscalar
&in
) { return _simd_setzero_ps(); }
62 static simdscalar
pack(simdscalar
&in
) { return _simd_setzero_ps(); }
63 #if ENABLE_AVX512_SIMD16
64 static simd16scalar
loadSOA_16(const uint8_t *pSrc
) { return _simd16_setzero_ps(); }
65 static void storeSOA(uint8_t *pDst
, simd16scalar src
) { return; }
66 static simd16scalar
unpack(simd16scalar
&in
) { return _simd16_setzero_ps(); }
67 static simd16scalar
pack(simd16scalar
&in
) { return _simd16_setzero_ps(); }
71 //////////////////////////////////////////////////////////////////////////
72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
73 //////////////////////////////////////////////////////////////////////////
75 struct PackTraits
<8, false>
77 static const uint32_t MyNumBits
= 8;
79 static simdscalar
loadSOA(const uint8_t *pSrc
)
81 #if KNOB_SIMD_WIDTH == 8
82 __m256 result
= _mm256_setzero_ps();
83 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
84 return _mm256_insertf128_ps(result
, vLo
, 0);
86 #error Unsupported vector width
90 static void storeSOA(uint8_t *pDst
, simdscalar src
)
93 #if KNOB_SIMD_WIDTH == 8
94 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
96 #error Unsupported vector width
100 static simdscalar
unpack(simdscalar
&in
)
102 #if KNOB_SIMD_WIDTH == 8
103 #if KNOB_ARCH==KNOB_ARCH_AVX
104 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
105 __m128i resLo
= _mm_cvtepu8_epi32(src
);
106 __m128i resHi
= _mm_shuffle_epi8(src
,
107 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
109 __m256i result
= _mm256_castsi128_si256(resLo
);
110 result
= _mm256_insertf128_si256(result
, resHi
, 1);
111 return _mm256_castsi256_ps(result
);
112 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
113 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
116 #error Unsupported vector width
120 static simdscalar
pack(simdscalar
&in
)
122 #if KNOB_SIMD_WIDTH == 8
123 simdscalari src
= _simd_castps_si(in
);
124 __m128i res16
= _mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
125 __m128i res8
= _mm_packus_epi16(res16
, _mm_undefined_si128());
126 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
128 #error Unsupported vector width
131 #if ENABLE_AVX512_SIMD16
133 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
135 simd16scalar result
= _simd16_setzero_ps();
136 simdscalar resultlo
= _simd_setzero_ps();
138 const __m128 src
= _mm_load_ps(reinterpret_cast<const float *>(pSrc
));
140 resultlo
= _mm256_insertf128_ps(resultlo
, src
, 0);
141 result
= _simd16_insert_ps(result
, resultlo
, 0);
146 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
148 // store simd16 bytes
149 _mm_store_ps(reinterpret_cast<float *>(pDst
), _mm256_castps256_ps128(_simd16_extract_ps(src
, 0)));
152 static simd16scalar
unpack(simd16scalar
&in
)
154 simd16scalari result
= _simd16_setzero_si();
156 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in
, 0)));
158 result
= _simd16_insert_si(result
, _simd_cvtepu8_epi32(src
), 0);
159 result
= _simd16_insert_si(result
, _simd_cvtepu8_epi32(_mm_srli_si128(src
, 8)), 1);
161 return _simd16_castsi_ps(result
);
164 static simd16scalar
pack(simd16scalar
&in
)
166 simd16scalari result
= _simd16_setzero_si();
167 simdscalari resultlo
= _simd_setzero_si();
169 __m128i templo
= _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in
, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 1));
170 __m128i temphi
= _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in
, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 1)), 1));
172 __m128i temp
= _mm_packus_epi16(templo
, temphi
);
174 resultlo
= _mm256_inserti128_si256(resultlo
, temp
, 0);
175 result
= _simd16_insert_si(result
, resultlo
, 0);
177 return _simd16_castsi_ps(result
);
182 //////////////////////////////////////////////////////////////////////////
183 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
184 //////////////////////////////////////////////////////////////////////////
186 struct PackTraits
<8, true>
188 static const uint32_t MyNumBits
= 8;
190 static simdscalar
loadSOA(const uint8_t *pSrc
)
192 #if KNOB_SIMD_WIDTH == 8
193 __m256 result
= _mm256_setzero_ps();
194 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
195 return _mm256_insertf128_ps(result
, vLo
, 0);
197 #error Unsupported vector width
201 static void storeSOA(uint8_t *pDst
, simdscalar src
)
204 #if KNOB_SIMD_WIDTH == 8
205 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
207 #error Unsupported vector width
211 static simdscalar
unpack(simdscalar
&in
)
213 #if KNOB_SIMD_WIDTH == 8
214 #if KNOB_ARCH==KNOB_ARCH_AVX
215 SWR_ASSERT(0); // I think this may be incorrect.
216 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
217 __m128i resLo
= _mm_cvtepi8_epi32(src
);
218 __m128i resHi
= _mm_shuffle_epi8(src
,
219 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
221 __m256i result
= _mm256_castsi128_si256(resLo
);
222 result
= _mm256_insertf128_si256(result
, resHi
, 1);
223 return _mm256_castsi256_ps(result
);
224 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
225 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
228 #error Unsupported vector width
232 static simdscalar
pack(simdscalar
&in
)
234 #if KNOB_SIMD_WIDTH == 8
235 simdscalari src
= _simd_castps_si(in
);
236 __m128i res16
= _mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
237 __m128i res8
= _mm_packs_epi16(res16
, _mm_undefined_si128());
238 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
240 #error Unsupported vector width
243 #if ENABLE_AVX512_SIMD16
245 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
247 simd16scalar result
= _simd16_setzero_ps();
248 simdscalar resultlo
= _simd_setzero_ps();
250 const __m128 src
= _mm_load_ps(reinterpret_cast<const float *>(pSrc
));
252 resultlo
= _mm256_insertf128_ps(resultlo
, src
, 0);
253 result
= _simd16_insert_ps(result
, resultlo
, 0);
258 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
260 // store simd16 bytes
261 _mm_store_ps(reinterpret_cast<float *>(pDst
), _mm256_castps256_ps128(_simd16_extract_ps(src
, 0)));
264 static simd16scalar
unpack(simd16scalar
&in
)
266 simd16scalari result
= _simd16_setzero_si();
268 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in
, 0)));
270 result
= _simd16_insert_si(result
, _simd_cvtepu8_epi32(src
), 0);
271 result
= _simd16_insert_si(result
, _simd_cvtepu8_epi32(_mm_srli_si128(src
, 8)), 1);
273 return _simd16_castsi_ps(result
);
276 static simd16scalar
pack(simd16scalar
&in
)
278 simd16scalari result
= _simd16_setzero_si();
279 simdscalari resultlo
= _simd_setzero_si();
281 __m128i templo
= _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in
, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 1));
282 __m128i temphi
= _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in
, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 1)), 1));
284 __m128i temp
= _mm_packs_epi16(templo
, temphi
);
286 resultlo
= _mm256_inserti128_si256(resultlo
, temp
, 0);
287 result
= _simd16_insert_si(result
, resultlo
, 0);
289 return _simd16_castsi_ps(result
);
294 //////////////////////////////////////////////////////////////////////////
295 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
296 //////////////////////////////////////////////////////////////////////////
298 struct PackTraits
<16, false>
300 static const uint32_t MyNumBits
= 16;
302 static simdscalar
loadSOA(const uint8_t *pSrc
)
304 #if KNOB_SIMD_WIDTH == 8
305 __m256 result
= _mm256_setzero_ps();
306 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
307 return _mm256_insertf128_ps(result
, vLo
, 0);
309 #error Unsupported vector width
313 static void storeSOA(uint8_t *pDst
, simdscalar src
)
315 #if KNOB_SIMD_WIDTH == 8
316 // store 16B (2B * 8)
317 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
319 #error Unsupported vector width
323 static simdscalar
unpack(simdscalar
&in
)
325 #if KNOB_SIMD_WIDTH == 8
326 #if KNOB_ARCH==KNOB_ARCH_AVX
327 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
328 __m128i resLo
= _mm_cvtepu16_epi32(src
);
329 __m128i resHi
= _mm_shuffle_epi8(src
,
330 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
332 __m256i result
= _mm256_castsi128_si256(resLo
);
333 result
= _mm256_insertf128_si256(result
, resHi
, 1);
334 return _mm256_castsi256_ps(result
);
335 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
336 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
339 #error Unsupported vector width
343 static simdscalar
pack(simdscalar
&in
)
345 #if KNOB_SIMD_WIDTH == 8
346 simdscalari src
= _simd_castps_si(in
);
347 __m256i res
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
348 return _mm256_castsi256_ps(res
);
350 #error Unsupported vector width
353 #if ENABLE_AVX512_SIMD16
355 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
357 simd16scalar result
= _simd16_setzero_ps();
359 simdscalar resultlo
= _simd_load_ps(reinterpret_cast<const float *>(pSrc
));
361 result
= _simd16_insert_ps(result
, resultlo
, 0);
366 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
368 _simd_store_ps(reinterpret_cast<float *>(pDst
), _simd16_extract_ps(src
, 0));
371 static simd16scalar
unpack(simd16scalar
&in
)
373 simd16scalari result
= _simd16_setzero_si();
375 result
= _simd16_insert_si(result
, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 0)), 0);
376 result
= _simd16_insert_si(result
, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 1)), 1);
378 return _simd16_castsi_ps(result
);
381 static simd16scalar
pack(simd16scalar
&in
)
383 simd16scalari result
= _simd16_setzero_si();
385 simdscalari inlo
= _simd_castps_si(_simd16_extract_ps(in
, 0));
386 simdscalari inhi
= _simd_castps_si(_simd16_extract_ps(in
, 1));
388 simdscalari templo
= _simd_permute2f128_si(inlo
, inhi
, 0x20);
389 simdscalari temphi
= _simd_permute2f128_si(inlo
, inhi
, 0x31);
391 result
= _simd16_insert_si(result
, _simd_packus_epi32(templo
, temphi
), 0);
393 return _simd16_castsi_ps(result
);
398 //////////////////////////////////////////////////////////////////////////
399 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
400 //////////////////////////////////////////////////////////////////////////
402 struct PackTraits
<16, true>
404 static const uint32_t MyNumBits
= 16;
406 static simdscalar
loadSOA(const uint8_t *pSrc
)
408 #if KNOB_SIMD_WIDTH == 8
409 __m256 result
= _mm256_setzero_ps();
410 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
411 return _mm256_insertf128_ps(result
, vLo
, 0);
413 #error Unsupported vector width
417 static void storeSOA(uint8_t *pDst
, simdscalar src
)
419 #if KNOB_SIMD_WIDTH == 8
420 // store 16B (2B * 8)
421 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
423 #error Unsupported vector width
427 static simdscalar
unpack(simdscalar
&in
)
429 #if KNOB_SIMD_WIDTH == 8
430 #if KNOB_ARCH==KNOB_ARCH_AVX
431 SWR_ASSERT(0); // I think this is incorrectly implemented
432 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
433 __m128i resLo
= _mm_cvtepi16_epi32(src
);
434 __m128i resHi
= _mm_shuffle_epi8(src
,
435 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
437 __m256i result
= _mm256_castsi128_si256(resLo
);
438 result
= _mm256_insertf128_si256(result
, resHi
, 1);
439 return _mm256_castsi256_ps(result
);
440 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
441 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
444 #error Unsupported vector width
448 static simdscalar
pack(simdscalar
&in
)
450 #if KNOB_SIMD_WIDTH == 8
451 simdscalari src
= _simd_castps_si(in
);
452 __m256i res
= _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
453 return _mm256_castsi256_ps(res
);
455 #error Unsupported vector width
458 #if ENABLE_AVX512_SIMD16
460 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
462 simd16scalar result
= _simd16_setzero_ps();
464 simdscalar resultlo
= _simd_load_ps(reinterpret_cast<const float *>(pSrc
));
466 result
= _simd16_insert_ps(result
, resultlo
, 0);
471 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
473 _simd_store_ps(reinterpret_cast<float *>(pDst
), _simd16_extract_ps(src
, 0));
476 static simd16scalar
unpack(simd16scalar
&in
)
478 simd16scalari result
= _simd16_setzero_si();
480 result
= _simd16_insert_si(result
, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 0)), 0);
481 result
= _simd16_insert_si(result
, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in
, 0)), 1)), 1);
483 return _simd16_castsi_ps(result
);
486 static simd16scalar
pack(simd16scalar
&in
)
488 simd16scalari result
= _simd16_setzero_si();
490 simdscalari inlo
= _simd_castps_si(_simd16_extract_ps(in
, 0));
491 simdscalari inhi
= _simd_castps_si(_simd16_extract_ps(in
, 1));
493 simdscalari templo
= _simd_permute2f128_si(inlo
, inhi
, 0x20);
494 simdscalari temphi
= _simd_permute2f128_si(inlo
, inhi
, 0x31);
496 result
= _simd16_insert_si(result
, _simd_packus_epi32(templo
, temphi
), 0);
498 return _simd16_castsi_ps(result
);
503 //////////////////////////////////////////////////////////////////////////
504 /// PackTraits - Helpers for packing / unpacking 32 bit channels
505 //////////////////////////////////////////////////////////////////////////
507 struct PackTraits
<32, false>
509 static const uint32_t MyNumBits
= 32;
511 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_load_ps((const float*)pSrc
); }
512 static void storeSOA(uint8_t *pDst
, simdscalar src
) { _simd_store_ps((float*)pDst
, src
); }
513 static simdscalar
unpack(simdscalar
&in
) { return in
; }
514 static simdscalar
pack(simdscalar
&in
) { return in
; }
515 #if ENABLE_AVX512_SIMD16
517 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
519 return _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
522 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
524 _simd16_store_ps(reinterpret_cast<float *>(pDst
), src
);
527 static simd16scalar
unpack(simd16scalar
&in
)
532 static simd16scalar
pack(simd16scalar
&in
)
539 //////////////////////////////////////////////////////////////////////////
540 /// TypeTraits - Format type traits.
541 //////////////////////////////////////////////////////////////////////////
542 template<SWR_TYPE type
, uint32_t NumBits
>
543 struct TypeTraits
: PackTraits
<NumBits
>
545 static const SWR_TYPE MyType
= type
;
546 static float toFloat() { return 0.0; }
547 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
548 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
551 //////////////////////////////////////////////////////////////////////////
552 /// TypeTraits - Format type traits specialization for UINT8
553 //////////////////////////////////////////////////////////////////////////
554 template<> struct TypeTraits
<SWR_TYPE_UINT
, 8> : PackTraits
<8>
556 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
557 static float toFloat() { return 0.0; }
558 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
559 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT8
//////////////////////////////////////////////////////////////////////////
565 template<> struct TypeTraits
<SWR_TYPE_SINT
, 8> : PackTraits
<8, true>
567 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
568 static float toFloat() { return 0.0; }
569 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
570 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
573 //////////////////////////////////////////////////////////////////////////
574 /// TypeTraits - Format type traits specialization for UINT16
575 //////////////////////////////////////////////////////////////////////////
576 template<> struct TypeTraits
<SWR_TYPE_UINT
, 16> : PackTraits
<16>
578 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
579 static float toFloat() { return 0.0; }
580 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
581 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
584 //////////////////////////////////////////////////////////////////////////
585 /// TypeTraits - Format type traits specialization for SINT16
586 //////////////////////////////////////////////////////////////////////////
587 template<> struct TypeTraits
<SWR_TYPE_SINT
, 16> : PackTraits
<16, true>
589 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
590 static float toFloat() { return 0.0; }
591 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
592 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
595 //////////////////////////////////////////////////////////////////////////
596 /// TypeTraits - Format type traits specialization for UINT32
597 //////////////////////////////////////////////////////////////////////////
598 template<> struct TypeTraits
<SWR_TYPE_UINT
, 32> : PackTraits
<32>
600 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
601 static float toFloat() { return 0.0; }
602 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
603 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT32
//////////////////////////////////////////////////////////////////////////
609 template<> struct TypeTraits
<SWR_TYPE_SINT
, 32> : PackTraits
<32>
611 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
612 static float toFloat() { return 0.0; }
613 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
614 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
617 //////////////////////////////////////////////////////////////////////////
618 /// TypeTraits - Format type traits specialization for UNORM5
619 //////////////////////////////////////////////////////////////////////////
620 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 5> : PackTraits
<5>
622 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
623 static float toFloat() { return 1.0f
/ 31.0f
; }
624 static float fromFloat() { return 31.0f
; }
625 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
628 //////////////////////////////////////////////////////////////////////////
629 /// TypeTraits - Format type traits specialization for UNORM6
630 //////////////////////////////////////////////////////////////////////////
631 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 6> : PackTraits
<6>
633 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
634 static float toFloat() { return 1.0f
/ 63.0f
; }
635 static float fromFloat() { return 63.0f
; }
636 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
639 //////////////////////////////////////////////////////////////////////////
640 /// TypeTraits - Format type traits specialization for UNORM8
641 //////////////////////////////////////////////////////////////////////////
642 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 8> : PackTraits
<8>
644 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
645 static float toFloat() { return 1.0f
/ 255.0f
; }
646 static float fromFloat() { return 255.0f
; }
647 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM8
//////////////////////////////////////////////////////////////////////////
653 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 8> : PackTraits
<8, true>
655 static const SWR_TYPE MyType
= SWR_TYPE_SNORM
;
656 static float toFloat() { return 1.0f
/ 127.0f
; }
657 static float fromFloat() { return 127.0f
; }
658 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
661 //////////////////////////////////////////////////////////////////////////
662 /// TypeTraits - Format type traits specialization for UNORM16
663 //////////////////////////////////////////////////////////////////////////
664 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 16> : PackTraits
<16>
666 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
667 static float toFloat() { return 1.0f
/ 65535.0f
; }
668 static float fromFloat() { return 65535.0f
; }
669 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
672 //////////////////////////////////////////////////////////////////////////
673 /// TypeTraits - Format type traits specialization for SNORM16
674 //////////////////////////////////////////////////////////////////////////
675 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 16> : PackTraits
<16, true>
677 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
678 static float toFloat() { return 1.0f
/ 32767.0f
; }
679 static float fromFloat() { return 32767.0f
; }
680 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
683 //////////////////////////////////////////////////////////////////////////
684 /// TypeTraits - Format type traits specialization for UNORM24
685 //////////////////////////////////////////////////////////////////////////
687 struct TypeTraits
< SWR_TYPE_UNORM
, 24 > : PackTraits
<32>
689 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
690 static float toFloat() { return 1.0f
/ 16777215.0f
; }
691 static float fromFloat() { return 16777215.0f
; }
692 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
695 //////////////////////////////////////////////////////////////////////////
696 // FLOAT Specializations from here on...
697 //////////////////////////////////////////////////////////////////////////
// Reinterpret a 128-bit register between its float (__m128) and integer
// (__m128i) views. These are bit casts only; no value conversion happens.
#define TO_M128i(a) _mm_castps_si128(a)
#define TO_M128(a) _mm_castsi128_ps(a)
703 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
704 inline static __m128
fastpow(__m128 arg
) {
707 static const __m128 factor
= _mm_set1_ps(exp2(127.0f
* expden
/ expnum
- 127.0f
)
708 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
));
710 // Apply a constant pre-correction factor.
711 ret
= _mm_mul_ps(ret
, factor
);
713 // Reinterpret arg as integer to obtain logarithm.
714 //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
715 ret
= _mm_cvtepi32_ps(_mm_castps_si128(ret
));
717 // Multiply logarithm by power.
718 ret
= _mm_mul_ps(ret
, _mm_set1_ps(1.0f
* expnum
/ expden
));
720 // Convert back to "integer" to exponentiate.
721 //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
722 ret
= _mm_castsi128_ps(_mm_cvtps_epi32(ret
));
727 inline static __m128
pow512_4(__m128 arg
) {
728 // 5/12 is too small, so compute the 4th root of 20/12 instead.
729 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
730 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
731 __m128 xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
732 __m128 xover
= _mm_mul_ps(arg
, xf
);
734 __m128 xfm1
= _mm_rsqrt_ps(xf
);
735 __m128 x2
= _mm_mul_ps(arg
, arg
);
736 __m128 xunder
= _mm_mul_ps(x2
, xfm1
);
738 // sqrt2 * over + 2 * sqrt2 * under
739 __m128 xavg
= _mm_mul_ps(_mm_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
),
740 _mm_add_ps(xover
, xunder
));
742 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
743 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
747 inline static __m128
powf_wrapper(__m128 Base
, float Exp
)
749 float *f
= (float *)(&Base
);
751 return _mm_set_ps(powf(f
[3], Exp
),
757 static inline __m128
ConvertFloatToSRGB2(__m128
& Src
)
759 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
760 __m128i CmpToSRGBThresholdMask
= TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f
), Src
));
762 // squeeze the mask down to 16 bits (4 bits per DWORD)
763 int CompareResult
= _mm_movemask_epi8(CmpToSRGBThresholdMask
);
768 if (CompareResult
== 0xFFFF)
770 // all DWORDs are <= the threshold
771 Result
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
773 else if (CompareResult
== 0x0)
775 // all DWORDs are > the threshold
776 __m128 fSrc_0RGB
= Src
;
778 // --> 1.055f * c(1.0f/2.4f) - 0.055f
779 #if KNOB_USE_FAST_SRGB == TRUE
780 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
781 __m128 f
= pow512_4(fSrc_0RGB
);
783 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
785 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
786 Result
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
790 // some DWORDs are <= the threshold and some are > threshold
791 __m128 Src_0RGB_mul_denorm
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
793 __m128 fSrc_0RGB
= Src
;
795 // --> 1.055f * c(1.0f/2.4f) - 0.055f
796 #if KNOB_USE_FAST_SRGB == TRUE
797 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
798 __m128 f
= pow512_4(fSrc_0RGB
);
800 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
802 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
803 f
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
805 // Clear the alpha (is garbage after the sub)
806 __m128i i
= _mm_and_si128(TO_M128i(f
), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
808 __m128i LessThanPart
= _mm_and_si128(CmpToSRGBThresholdMask
, TO_M128i(Src_0RGB_mul_denorm
));
809 __m128i GreaterEqualPart
= _mm_andnot_si128(CmpToSRGBThresholdMask
, i
);
810 __m128i CombinedParts
= _mm_or_si128(LessThanPart
, GreaterEqualPart
);
812 Result
= TO_M128(CombinedParts
);
818 #if ENABLE_AVX512_SIMD16
819 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
820 inline static simd16scalar
fastpow(simd16scalar value
)
822 static const float factor1
= exp2(127.0f
* expden
/ expnum
- 127.0f
)
823 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
);
825 // Apply a constant pre-correction factor.
826 simd16scalar result
= _simd16_mul_ps(value
, _simd16_set1_ps(factor1
));
828 // Reinterpret arg as integer to obtain logarithm.
829 //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
830 result
= _simd16_cvtepi32_ps(_simd16_castps_si(result
));
832 // Multiply logarithm by power.
833 result
= _simd16_mul_ps(result
, _simd16_set1_ps(1.0f
* expnum
/ expden
));
835 // Convert back to "integer" to exponentiate.
836 //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
837 result
= _simd16_castsi_ps(_simd16_cvtps_epi32(result
));
842 inline static simd16scalar
pow512_4(simd16scalar arg
)
844 // 5/12 is too small, so compute the 4th root of 20/12 instead.
845 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
846 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
847 simd16scalar xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
848 simd16scalar xover
= _simd16_mul_ps(arg
, xf
);
850 simd16scalar xfm1
= _simd16_rsqrt_ps(xf
);
851 simd16scalar x2
= _simd16_mul_ps(arg
, arg
);
852 simd16scalar xunder
= _simd16_mul_ps(x2
, xfm1
);
854 // sqrt2 * over + 2 * sqrt2 * under
855 simd16scalar xavg
= _simd16_mul_ps(_simd16_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
), _simd16_add_ps(xover
, xunder
));
857 xavg
= _simd16_mul_ps(xavg
, _simd16_rsqrt_ps(xavg
));
858 xavg
= _simd16_mul_ps(xavg
, _simd16_rsqrt_ps(xavg
));
863 inline static simd16scalar
powf_wrapper(const simd16scalar base
, float exp
)
865 const float *f
= reinterpret_cast<const float *>(&base
);
867 return _simd16_set_ps(
887 // float to SRGB conversion formula
889 // if (value < 0.0031308f)
892 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
894 static inline simd16scalar
ConvertFloatToSRGB2(const simd16scalar
&value
)
896 // create a mask where the source is < the minimal SRGB float value
897 const simd16mask mask
= _simd16_cmplt_ps_mask(value
, _simd16_set1_ps(0.0031308f
));
899 // if all elements are < the threshold, result = value * 12.92
900 simd16scalar result
= _simd16_mul_ps(value
, _simd16_set1_ps(12.92f
));
902 if (_simd16_mask2int(mask
) != 0xFFFF)
904 // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
905 #if KNOB_USE_FAST_SRGB == TRUE
906 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
907 simd16scalar result2
= pow512_4(value
);
909 simd16scalar result2
= powf_wrapper(value
, 1.0f
/ 2.4f
);
912 result2
= _simd16_mul_ps(result2
, _simd16_set1_ps(1.055f
));
913 result2
= _simd16_sub_ps(result2
, _simd16_set1_ps(0.055f
));
915 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
916 // only native AVX512 can directly use the computed mask for the blend operation
917 result
= _mm512_mask_blend_ps(mask
, result2
, result
);
919 result
= _simd16_blendv_ps(result2
, result
, _simd16_cmplt_ps(value
, _simd16_set1_ps(0.0031308f
)));
927 //////////////////////////////////////////////////////////////////////////
928 /// TypeTraits - Format type traits specialization for FLOAT16
929 //////////////////////////////////////////////////////////////////////////
// Traits for 16-bit float components, layered on PackTraits<16>.
//  - toFloat()/fromFloat() both return 1.0f.
//  - convertSrgb() and both unpack() overloads are unimplemented: they assert and return zero.
//  - pack() converts float32 lanes to float16 bit patterns.
// NOTE(review): brace and #else/#endif lines are not visible in this excerpt; the
// comments below describe only the statements that are shown.
930 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 16> : PackTraits
<16>
932 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
933 static float toFloat() { return 1.0f
; }
934 static float fromFloat() { return 1.0f
; }
// sRGB conversion is not provided for FLOAT16: asserts and returns a zero vector.
935 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
// Converts 8 float32 lanes to float16. The plain-AVX section does the conversion with
// integer bit manipulation; the alternative shown further down uses _mm256_cvtps_ph.
937 static simdscalar
pack(const simdscalar
&in
)
939 #if KNOB_SIMD_WIDTH == 8
940 #if (KNOB_ARCH == KNOB_ARCH_AVX)
941 // input is 8 packed float32, output is 8 packed float16
942 simdscalari src
= _simd_castps_si(in
);
// float32 layout: 8 exponent bits above 23 mantissa bits.
944 static const uint32_t FLOAT_EXP_BITS
= 8;
945 static const uint32_t FLOAT_MANTISSA_BITS
= 23;
946 static const uint32_t FLOAT_MANTISSA_MASK
= (1U << FLOAT_MANTISSA_BITS
) - 1;
947 static const uint32_t FLOAT_EXP_MASK
= ((1U << FLOAT_EXP_BITS
) - 1) << FLOAT_MANTISSA_BITS
;
// float16 layout: 5 exponent bits above 10 mantissa bits.
949 static const uint32_t HALF_EXP_BITS
= 5;
950 static const uint32_t HALF_MANTISSA_BITS
= 10;
951 static const uint32_t HALF_EXP_MASK
= ((1U << HALF_EXP_BITS
) - 1) << HALF_MANTISSA_BITS
;
953 // minimum exponent required, exponents below this are flushed to 0.
954 static const int32_t HALF_EXP_MIN
= -14;
955 static const int32_t FLOAT_EXP_BIAS
= 127;
// -14 + 127 = 113: biased float32 exponent of the smallest normal half.
956 static const int32_t FLOAT_EXP_MIN
= HALF_EXP_MIN
+ FLOAT_EXP_BIAS
;
// 113 - 11 = 102: exponents below this are masked to zero further down.
957 static const int32_t FLOAT_EXP_MIN_FTZ
= FLOAT_EXP_MIN
- (HALF_MANTISSA_BITS
+ 1); // +1 for the lack of implicit significand
959 // maximum exponent required, exponents above this are set to infinity
// NOTE(review): the code below replaces such lanes with 0x7BFF rather than the
// infinity bit pattern - the comment above and the code appear to disagree.
960 static const int32_t HALF_EXP_MAX
= 15;
961 static const int32_t FLOAT_EXP_MAX
= HALF_EXP_MAX
+ FLOAT_EXP_BIAS
;
// Broadcast the field masks and the exponent thresholds (positioned in the exponent field).
963 const simdscalari vSignMask
= _simd_set1_epi32(0x80000000);
964 const simdscalari vExpMask
= _simd_set1_epi32(FLOAT_EXP_MASK
);
965 const simdscalari vManMask
= _simd_set1_epi32(FLOAT_MANTISSA_MASK
);
966 const simdscalari vExpMin
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN
<< FLOAT_MANTISSA_BITS
));
967 const simdscalari vExpMinFtz
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN_FTZ
<< FLOAT_MANTISSA_BITS
));
968 const simdscalari vExpMax
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MAX
<< FLOAT_MANTISSA_BITS
));
// Split every lane into its sign, exponent and mantissa fields (left in place, not shifted).
970 simdscalari vSign
= _simd_and_si(src
, vSignMask
);
971 simdscalari vExp
= _simd_and_si(src
, vExpMask
);
972 simdscalari vMan
= _simd_and_si(src
, vManMask
);
// Per-lane classification:
//   vFTZMask    - exponent below the flush threshold
//   vDenormMask - exponent below the half's minimum but not flushed
//   vInfMask    - exponent field all ones (Inf / NaN)
//   vClampMask  - exponent above the half's maximum, excluding Inf / NaN
974 simdscalari vFTZMask
= _simd_cmplt_epi32(vExp
, vExpMinFtz
);
975 simdscalari vDenormMask
= _simd_andnot_si(vFTZMask
, _simd_cmplt_epi32(vExp
, vExpMin
));
976 simdscalari vInfMask
= _simd_cmpeq_epi32(vExpMask
, vExp
);
977 simdscalari vClampMask
= _simd_andnot_si(vInfMask
, _simd_cmplt_epi32(vExpMax
, vExp
));
// Re-bias the exponent: subtract the minimum and add one exponent step, so the
// smallest normal half ends up with exponent field 1 after the shift below.
979 simdscalari vHalfExp
= _simd_add_epi32(_simd_sub_epi32(vExp
, vExpMin
), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS
));
981 // pack output 16-bits into the lower 16-bits of each 32-bit channel
982 simdscalari vDst
= _simd_and_si(_simd_srli_epi32(vHalfExp
, 13), _simd_set1_epi32(HALF_EXP_MASK
));
// Drop the low 13 mantissa bits (23 -> 10) by shifting, without rounding.
983 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vMan
, FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
// Zero every lane whose exponent was below the flush threshold.
986 vDst
= _simd_andnot_si(vFTZMask
, vDst
);
987 // Apply Infinites / NaN
988 vDst
= _simd_or_si(vDst
, _simd_and_si(vInfMask
, _simd_set1_epi32(HALF_EXP_MASK
)));
// Replace out-of-range lanes with 0x7BFF (exponent field 30, all mantissa bits set).
991 vDst
= _simd_andnot_si(vClampMask
, vDst
);
992 vDst
= _simd_or_si(vDst
,
993 _simd_and_si(vClampMask
, _simd_set1_epi32(0x7BFF)));
995 // Compute Denormals (subnormals)
// Scalar fix-up, entered only when at least one lane has bits set in vDenormMask.
996 if (!_mm256_testz_si256(vDenormMask
, vDenormMask
))
// Reinterpret the vectors as arrays of 32-bit lanes for per-lane access.
998 uint32_t *pDenormMask
= (uint32_t*)&vDenormMask
;
999 uint32_t *pExp
= (uint32_t*)&vExp
;
1000 uint32_t *pMan
= (uint32_t*)&vMan
;
1001 uint32_t *pDst
= (uint32_t*)&vDst
;
// NOTE(review): pDenormMask is not referenced in the lines visible here; presumably a
// per-lane guard on pDenormMask[i] wraps the loop body - confirm against the full source.
1002 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
1006 // Need to compute subnormal value
1007 uint32_t exponent
= pExp
[i
] >> FLOAT_MANTISSA_BITS
;
1008 uint32_t mantissa
= pMan
[i
] |
1009 (1U << FLOAT_MANTISSA_BITS
); // Denorms include no "implicit" 1s. Make it explicit
// Shift right by 13 (23 -> 10 mantissa bits) plus one bit for every exponent step
// below the half's minimum exponent.
1011 pDst
[i
] = mantissa
>> ((FLOAT_EXP_MIN
- exponent
) + (FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
// Move the sign from bit 31 down to bit 15 and merge it in.
1017 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vSign
, 16));
1019 // Pack to lower 128-bits
1020 vDst
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst
), _mm256_extractf128_si256(vDst
, 1)));
// Debug cross-check of the first four 32-bit elements against _mm256_cvtps_ph.
// NOTE(review): any enclosing preprocessor guard beyond NDEBUG is not visible here.
1023 #if !defined(NDEBUG)
1024 simdscalari vCheck
= _mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
));
1026 for (uint32_t i
= 0; i
< 4; ++i
)
1028 SWR_ASSERT(vCheck
.m256i_i32
[i
] == vDst
.m256i_i32
[i
]);
1033 return _simd_castsi_ps(vDst
);
// Alternative conversion: _mm256_cvtps_ph with truncation.
1036 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
)));
1039 #error Unsupported vector width
// float16 -> float32 is not implemented: asserts and returns a zero vector.
1043 static simdscalar
unpack(const simdscalar
&in
)
1045 // input is 8 packed float16, output is 8 packed float32
1046 SWR_ASSERT(0); // @todo
1047 return _simd_setzero_ps();
1049 #if ENABLE_AVX512_SIMD16
// 16-wide pack: converts each 8-lane half to float16 and places the two 128-bit
// results side by side in the lower 256 bits of an otherwise zero result.
1051 static simd16scalar
pack(const simd16scalar
&in
)
1053 simd16scalari result
= _simd16_setzero_si();
1054 simdscalari resultlo
= _simd_setzero_si();
1056 #if (KNOB_ARCH == KNOB_ARCH_AVX)
// Plain AVX: reuse the 8-wide pack() and take the low 128 bits of each result.
1057 simdscalar simdlo
= pack(_simd16_extract_ps(in
, 0));
1058 simdscalar simdhi
= pack(_simd16_extract_ps(in
, 1));
1060 __m128i templo
= _mm256_extractf128_si256(_simd_castps_si(simdlo
), 0);
1061 __m128i temphi
= _mm256_extractf128_si256(_simd_castps_si(simdhi
), 0);
// Alternative: convert each half directly with _mm256_cvtps_ph (truncation).
1064 __m128i templo
= _mm256_cvtps_ph(_simd16_extract_ps(in
, 0), _MM_FROUND_TRUNC
);
1065 __m128i temphi
= _mm256_cvtps_ph(_simd16_extract_ps(in
, 1), _MM_FROUND_TRUNC
);
// Concatenate the two 128-bit halves, then insert them as the low half of the result.
1068 resultlo
= _mm256_insertf128_si256(resultlo
, templo
, 0);
1069 resultlo
= _mm256_insertf128_si256(resultlo
, temphi
, 1);
1071 result
= _simd16_insert_si(result
, resultlo
, 0);
1073 return _simd16_castsi_ps(result
);
// 16-wide float16 -> float32 is not implemented: asserts and returns a zero vector.
1076 static simd16scalar
unpack(const simd16scalar
&in
)
1078 // input is 16 packed float16, output is 16 packed float32
1079 SWR_ASSERT(0); // @todo
1080 return _simd16_setzero_ps();
1085 //////////////////////////////////////////////////////////////////////////
1086 /// TypeTraits - Format type traits specialization for FLOAT32
1087 //////////////////////////////////////////////////////////////////////////
1088 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 32> : PackTraits
<32>
1090 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
1091 static float toFloat() { return 1.0f
; }
1092 static float fromFloat() { return 1.0f
; }
1093 static inline simdscalar
convertSrgb(simdscalar
&in
)
1095 #if KNOB_SIMD_WIDTH == 8
1096 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
1097 __m128 srcLo
= _mm256_extractf128_ps(in
, 0);
1098 __m128 srcHi
= _mm256_extractf128_ps(in
, 1);
1100 srcLo
= ConvertFloatToSRGB2(srcLo
);
1101 srcHi
= ConvertFloatToSRGB2(srcHi
);
1103 in
= _mm256_insertf128_ps(in
, srcLo
, 0);
1104 in
= _mm256_insertf128_ps(in
, srcHi
, 1);
1107 #error Unsupported vector width
1111 #if ENABLE_AVX512_SIMD16
1113 static inline simd16scalar
convertSrgb(simd16scalar
&in
)
1115 return ConvertFloatToSRGB2(in
);
1120 //////////////////////////////////////////////////////////////////////////
1121 /// Format1 - Bitfield for single component formats.
1122 //////////////////////////////////////////////////////////////////////////
1123 template<uint32_t x
>
1130 ///@ The following are here to provide full template needed in Formats.
1137 //////////////////////////////////////////////////////////////////////////
1138 /// Format1 - Bitfield for single component formats - 8 bit specialization
1139 //////////////////////////////////////////////////////////////////////////
1147 ///@ The following are here to provide full template needed in Formats.
1154 //////////////////////////////////////////////////////////////////////////
1155 /// Format1 - Bitfield for single component formats - 16 bit specialization
1156 //////////////////////////////////////////////////////////////////////////
1164 ///@ The following are here to provide full template needed in Formats.
1171 //////////////////////////////////////////////////////////////////////////
1172 /// Format2 - Bitfield for 2 component formats.
1173 //////////////////////////////////////////////////////////////////////////
1174 template<uint32_t x
, uint32_t y
>
1184 ///@ The following are here to provide full template needed in Formats.
1190 //////////////////////////////////////////////////////////////////////////
1191 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
1192 //////////////////////////////////////////////////////////////////////////
1203 ///@ The following are here to provide full template needed in Formats.
1209 //////////////////////////////////////////////////////////////////////////
1210 /// Format3 - Bitfield for 3 component formats.
1211 //////////////////////////////////////////////////////////////////////////
1212 template<uint32_t x
, uint32_t y
, uint32_t z
>
1221 uint32_t a
; ///@note This is here to provide full template needed in Formats.
1224 //////////////////////////////////////////////////////////////////////////
1225 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
1226 //////////////////////////////////////////////////////////////////////////
1228 union Format3
<5,6,5>
1236 uint16_t a
; ///@note This is here to provide full template needed in Formats.
1239 //////////////////////////////////////////////////////////////////////////
1240 /// Format4 - Bitfield for 4 component formats.
1241 //////////////////////////////////////////////////////////////////////////
1242 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
1251 //////////////////////////////////////////////////////////////////////////
1252 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1253 //////////////////////////////////////////////////////////////////////////
1255 struct Format4
<5,5,5,1>
1263 //////////////////////////////////////////////////////////////////////////
1264 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1265 //////////////////////////////////////////////////////////////////////////
1267 struct Format4
<4,4,4,4>
1275 //////////////////////////////////////////////////////////////////////////
1276 /// ComponentTraits - Default components
1277 //////////////////////////////////////////////////////////////////////////
1278 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
1281 INLINE
static uint32_t GetDefault(uint32_t comp
)
1283 static const uint32_t defaults
[4]{ x
, y
, z
, w
};
1284 return defaults
[comp
];
1288 //////////////////////////////////////////////////////////////////////////
1289 /// ComponentTraits - Component type traits.
1290 //////////////////////////////////////////////////////////////////////////
1291 template<SWR_TYPE X
, uint32_t NumBitsX
, SWR_TYPE Y
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsY
= 0, SWR_TYPE Z
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsZ
= 0, SWR_TYPE W
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsW
= 0>
1292 struct ComponentTraits
1294 INLINE
static SWR_TYPE
GetType(uint32_t comp
)
1296 static const SWR_TYPE CompType
[4]{ X
, Y
, Z
, W
};
1297 return CompType
[comp
];
1300 INLINE
static uint32_t GetBPC(uint32_t comp
)
1302 static const uint32_t MyBpc
[4]{ NumBitsX
, NumBitsY
, NumBitsZ
, NumBitsW
};
1306 INLINE
static bool isNormalized(uint32_t comp
)
1311 return (X
== SWR_TYPE_UNORM
|| X
== SWR_TYPE_SNORM
) ? true : false;
1313 return (Y
== SWR_TYPE_UNORM
|| Y
== SWR_TYPE_SNORM
) ? true : false;
1315 return (Z
== SWR_TYPE_UNORM
|| Z
== SWR_TYPE_SNORM
) ? true : false;
1317 return (W
== SWR_TYPE_UNORM
|| W
== SWR_TYPE_SNORM
) ? true : false;
1323 INLINE
static float toFloat(uint32_t comp
)
1328 return TypeTraits
<X
, NumBitsX
>::toFloat();
1330 return TypeTraits
<Y
, NumBitsY
>::toFloat();
1332 return TypeTraits
<Z
, NumBitsZ
>::toFloat();
1334 return TypeTraits
<W
, NumBitsW
>::toFloat();
1337 return TypeTraits
<X
, NumBitsX
>::toFloat();
1341 INLINE
static float fromFloat(uint32_t comp
)
1346 return TypeTraits
<X
, NumBitsX
>::fromFloat();
1348 return TypeTraits
<Y
, NumBitsY
>::fromFloat();
1350 return TypeTraits
<Z
, NumBitsZ
>::fromFloat();
1352 return TypeTraits
<W
, NumBitsW
>::fromFloat();
1355 return TypeTraits
<X
, NumBitsX
>::fromFloat();
1358 INLINE
static simdscalar
loadSOA(uint32_t comp
, const uint8_t* pSrc
)
1363 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1365 return TypeTraits
<Y
, NumBitsY
>::loadSOA(pSrc
);
1367 return TypeTraits
<Z
, NumBitsZ
>::loadSOA(pSrc
);
1369 return TypeTraits
<W
, NumBitsW
>::loadSOA(pSrc
);
1372 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1375 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simdscalar src
)
1380 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1383 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1386 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1389 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1393 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1396 INLINE
static simdscalar
unpack(uint32_t comp
, simdscalar
&in
)
1401 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1403 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1405 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1407 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1410 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1413 INLINE
static simdscalar
pack(uint32_t comp
, simdscalar
&in
)
1418 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1420 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1422 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1424 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1427 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1430 INLINE
static simdscalar
convertSrgb(uint32_t comp
, simdscalar
&in
)
1435 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1437 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1439 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1441 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1444 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1446 #if ENABLE_AVX512_SIMD16
1448 INLINE
static simd16scalar
loadSOA_16(uint32_t comp
, const uint8_t* pSrc
)
1453 return TypeTraits
<X
, NumBitsX
>::loadSOA_16(pSrc
);
1455 return TypeTraits
<Y
, NumBitsY
>::loadSOA_16(pSrc
);
1457 return TypeTraits
<Z
, NumBitsZ
>::loadSOA_16(pSrc
);
1459 return TypeTraits
<W
, NumBitsW
>::loadSOA_16(pSrc
);
1462 return TypeTraits
<X
, NumBitsX
>::loadSOA_16(pSrc
);
1465 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simd16scalar src
)
1470 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1473 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1476 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1479 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1483 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1486 INLINE
static simd16scalar
unpack(uint32_t comp
, simd16scalar
&in
)
1491 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1493 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1495 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1497 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1500 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1503 INLINE
static simd16scalar
pack(uint32_t comp
, simd16scalar
&in
)
1508 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1510 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1512 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1514 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1517 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1520 INLINE
static simd16scalar
convertSrgb(uint32_t comp
, simd16scalar
&in
)
1525 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1527 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1529 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1531 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1534 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);