1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for SWR_FORMAT functions.
27 ******************************************************************************/
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
34 //////////////////////////////////////////////////////////////////////////
35 template <uint32_t NumBits
, bool Signed
= false>
38 static const uint32_t MyNumBits
= NumBits
;
39 static simdscalar
loadSOA(const uint8_t *pSrc
) = delete;
40 static void storeSOA(uint8_t *pDst
, simdscalar src
) = delete;
41 static simdscalar
unpack(simdscalar
&in
) = delete;
42 static simdscalar
pack(simdscalar
&in
) = delete;
43 #if ENABLE_AVX512_SIMD16
44 static simd16scalar
loadSOA_16(const uint8_t *pSrc
) = delete;
45 static void storeSOA(uint8_t *pDst
, simd16scalar src
) = delete;
46 static simd16scalar
unpack(simd16scalar
&in
) = delete;
47 static simd16scalar
pack(simd16scalar
&in
) = delete;
51 //////////////////////////////////////////////////////////////////////////
52 /// PackTraits - Helpers for packing / unpacking unused channels
53 //////////////////////////////////////////////////////////////////////////
55 struct PackTraits
<0, false>
57 static const uint32_t MyNumBits
= 0;
59 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_setzero_ps(); }
60 static void storeSOA(uint8_t *pDst
, simdscalar src
) { return; }
61 static simdscalar
unpack(simdscalar
&in
) { return _simd_setzero_ps(); }
62 static simdscalar
pack(simdscalar
&in
) { return _simd_setzero_ps(); }
63 #if ENABLE_AVX512_SIMD16
64 static simd16scalar
loadSOA_16(const uint8_t *pSrc
) { return _simd16_setzero_ps(); }
65 static void storeSOA(uint8_t *pDst
, simd16scalar src
) { return; }
66 static simd16scalar
unpack(simd16scalar
&in
) { return _simd16_setzero_ps(); }
67 static simd16scalar
pack(simd16scalar
&in
) { return _simd16_setzero_ps(); }
71 //////////////////////////////////////////////////////////////////////////
72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
73 //////////////////////////////////////////////////////////////////////////
75 struct PackTraits
<8, false>
77 static const uint32_t MyNumBits
= 8;
79 static simdscalar
loadSOA(const uint8_t *pSrc
)
81 #if KNOB_SIMD_WIDTH == 8
82 __m256 result
= _mm256_setzero_ps();
83 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
84 return _mm256_insertf128_ps(result
, vLo
, 0);
86 #error Unsupported vector width
90 static void storeSOA(uint8_t *pDst
, simdscalar src
)
93 #if KNOB_SIMD_WIDTH == 8
94 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
96 #error Unsupported vector width
100 static simdscalar
unpack(simdscalar
&in
)
102 #if KNOB_SIMD_WIDTH == 8
103 #if KNOB_ARCH==KNOB_ARCH_AVX
104 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
105 __m128i resLo
= _mm_cvtepu8_epi32(src
);
106 __m128i resHi
= _mm_shuffle_epi8(src
,
107 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
109 __m256i result
= _mm256_castsi128_si256(resLo
);
110 result
= _mm256_insertf128_si256(result
, resHi
, 1);
111 return _mm256_castsi256_ps(result
);
112 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
113 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
116 #error Unsupported vector width
120 static simdscalar
pack(simdscalar
&in
)
122 #if KNOB_SIMD_WIDTH == 8
123 simdscalari src
= _simd_castps_si(in
);
124 __m128i res16
= _mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
125 __m128i res8
= _mm_packus_epi16(res16
, _mm_undefined_si128());
126 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
128 #error Unsupported vector width
131 #if ENABLE_AVX512_SIMD16
133 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
135 simd16scalar result
= _simd16_setzero_ps();
136 simdscalar resultlo
= _simd_setzero_ps();
138 const __m128 src
= _mm_load_ps(reinterpret_cast<const float *>(pSrc
));
140 resultlo
= _mm256_insertf128_ps(resultlo
, src
, 0);
141 result
= _simd16_insert_ps(result
, resultlo
, 0);
146 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
148 // store simd16 bytes
149 _mm_store_ps(reinterpret_cast<float *>(pDst
), _mm256_castps256_ps128(_simd16_extract_ps(src
, 0)));
152 static simd16scalar
unpack(simd16scalar
&in
)
154 simd16scalari result
= _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in
, 0))));
156 return _simd16_castsi_ps(result
);
159 static simd16scalar
pack(simd16scalar
&in
)
161 simd16scalari result
= _simd16_setzero_si();
163 simdscalari inlo
= _simd_castps_si(_simd16_extract_ps(in
, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
164 simdscalari inhi
= _simd_castps_si(_simd16_extract_ps(in
, 1)); // r8 r9 rA rB rC rD rE rF
166 simdscalari permlo
= _simd_permute2f128_si(inlo
, inhi
, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
167 simdscalari permhi
= _simd_permute2f128_si(inlo
, inhi
, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
169 simdscalari pack
= _simd_packus_epi32(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
171 const simdscalari zero
= _simd_setzero_si();
173 permlo
= _simd_permute2f128_si(pack
, zero
, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
174 permhi
= _simd_permute2f128_si(pack
, zero
, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
176 pack
= _simd_packus_epi16(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
178 result
= _simd16_insert_si(result
, pack
, 0);
180 return _simd16_castsi_ps(result
);
185 //////////////////////////////////////////////////////////////////////////
186 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
187 //////////////////////////////////////////////////////////////////////////
189 struct PackTraits
<8, true>
191 static const uint32_t MyNumBits
= 8;
193 static simdscalar
loadSOA(const uint8_t *pSrc
)
195 #if KNOB_SIMD_WIDTH == 8
196 __m256 result
= _mm256_setzero_ps();
197 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
198 return _mm256_insertf128_ps(result
, vLo
, 0);
200 #error Unsupported vector width
204 static void storeSOA(uint8_t *pDst
, simdscalar src
)
207 #if KNOB_SIMD_WIDTH == 8
208 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
210 #error Unsupported vector width
214 static simdscalar
unpack(simdscalar
&in
)
216 #if KNOB_SIMD_WIDTH == 8
217 #if KNOB_ARCH==KNOB_ARCH_AVX
218 SWR_INVALID("I think this may be incorrect.");
219 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
220 __m128i resLo
= _mm_cvtepi8_epi32(src
);
221 __m128i resHi
= _mm_shuffle_epi8(src
,
222 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
224 __m256i result
= _mm256_castsi128_si256(resLo
);
225 result
= _mm256_insertf128_si256(result
, resHi
, 1);
226 return _mm256_castsi256_ps(result
);
227 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
228 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
231 #error Unsupported vector width
235 static simdscalar
pack(simdscalar
&in
)
237 #if KNOB_SIMD_WIDTH == 8
238 simdscalari src
= _simd_castps_si(in
);
239 __m128i res16
= _mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
240 __m128i res8
= _mm_packs_epi16(res16
, _mm_undefined_si128());
241 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
243 #error Unsupported vector width
246 #if ENABLE_AVX512_SIMD16
248 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
250 simd16scalar result
= _simd16_setzero_ps();
251 simdscalar resultlo
= _simd_setzero_ps();
253 const __m128 src
= _mm_load_ps(reinterpret_cast<const float *>(pSrc
));
255 resultlo
= _mm256_insertf128_ps(resultlo
, src
, 0);
256 result
= _simd16_insert_ps(result
, resultlo
, 0);
261 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
263 // store simd16 bytes
264 _mm_store_ps(reinterpret_cast<float *>(pDst
), _mm256_castps256_ps128(_simd16_extract_ps(src
, 0)));
267 static simd16scalar
unpack(simd16scalar
&in
)
269 simd16scalari result
= _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in
, 0))));
271 return _simd16_castsi_ps(result
);
274 static simd16scalar
pack(simd16scalar
&in
)
276 simd16scalari result
= _simd16_setzero_si();
278 simdscalari inlo
= _simd_castps_si(_simd16_extract_ps(in
, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
279 simdscalari inhi
= _simd_castps_si(_simd16_extract_ps(in
, 1)); // r8 r9 rA rB rC rD rE rF
281 simdscalari permlo
= _simd_permute2f128_si(inlo
, inhi
, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
282 simdscalari permhi
= _simd_permute2f128_si(inlo
, inhi
, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
284 simdscalari pack
= _simd_packs_epi32(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
286 const simdscalari zero
= _simd_setzero_si();
288 permlo
= _simd_permute2f128_si(pack
, zero
, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
289 permhi
= _simd_permute2f128_si(pack
, zero
, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
291 pack
= _simd_packs_epi16(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
293 result
= _simd16_insert_si(result
, pack
, 0);
295 return _simd16_castsi_ps(result
);
300 //////////////////////////////////////////////////////////////////////////
301 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
302 //////////////////////////////////////////////////////////////////////////
304 struct PackTraits
<16, false>
306 static const uint32_t MyNumBits
= 16;
308 static simdscalar
loadSOA(const uint8_t *pSrc
)
310 #if KNOB_SIMD_WIDTH == 8
311 __m256 result
= _mm256_setzero_ps();
312 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
313 return _mm256_insertf128_ps(result
, vLo
, 0);
315 #error Unsupported vector width
319 static void storeSOA(uint8_t *pDst
, simdscalar src
)
321 #if KNOB_SIMD_WIDTH == 8
322 // store 16B (2B * 8)
323 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
325 #error Unsupported vector width
329 static simdscalar
unpack(simdscalar
&in
)
331 #if KNOB_SIMD_WIDTH == 8
332 #if KNOB_ARCH==KNOB_ARCH_AVX
333 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
334 __m128i resLo
= _mm_cvtepu16_epi32(src
);
335 __m128i resHi
= _mm_shuffle_epi8(src
,
336 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
338 __m256i result
= _mm256_castsi128_si256(resLo
);
339 result
= _mm256_insertf128_si256(result
, resHi
, 1);
340 return _mm256_castsi256_ps(result
);
341 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
342 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
345 #error Unsupported vector width
349 static simdscalar
pack(simdscalar
&in
)
351 #if KNOB_SIMD_WIDTH == 8
352 simdscalari src
= _simd_castps_si(in
);
353 __m256i res
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
354 return _mm256_castsi256_ps(res
);
356 #error Unsupported vector width
359 #if ENABLE_AVX512_SIMD16
361 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
363 simd16scalar result
= _simd16_setzero_ps();
365 simdscalar resultlo
= _simd_load_ps(reinterpret_cast<const float *>(pSrc
));
367 result
= _simd16_insert_ps(result
, resultlo
, 0);
372 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
374 _simd_store_ps(reinterpret_cast<float *>(pDst
), _simd16_extract_ps(src
, 0));
377 static simd16scalar
unpack(simd16scalar
&in
)
379 simd16scalari result
= _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in
, 0)));
381 return _simd16_castsi_ps(result
);
384 static simd16scalar
pack(simd16scalar
&in
)
386 const simd16scalari zero
= _simd16_setzero_si();
388 simd16scalari permlo
= _simd16_permute2f128_si(_simd16_castps_si(in
), zero
, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
389 simd16scalari permhi
= _simd16_permute2f128_si(_simd16_castps_si(in
), zero
, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
391 simd16scalari result
= _simd16_packus_epi32(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
393 return _simd16_castsi_ps(result
);
398 //////////////////////////////////////////////////////////////////////////
399 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
400 //////////////////////////////////////////////////////////////////////////
402 struct PackTraits
<16, true>
404 static const uint32_t MyNumBits
= 16;
406 static simdscalar
loadSOA(const uint8_t *pSrc
)
408 #if KNOB_SIMD_WIDTH == 8
409 __m256 result
= _mm256_setzero_ps();
410 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
411 return _mm256_insertf128_ps(result
, vLo
, 0);
413 #error Unsupported vector width
417 static void storeSOA(uint8_t *pDst
, simdscalar src
)
419 #if KNOB_SIMD_WIDTH == 8
420 // store 16B (2B * 8)
421 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
423 #error Unsupported vector width
427 static simdscalar
unpack(simdscalar
&in
)
429 #if KNOB_SIMD_WIDTH == 8
430 #if KNOB_ARCH==KNOB_ARCH_AVX
431 SWR_INVALID("I think this may be incorrect.");
432 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
433 __m128i resLo
= _mm_cvtepi16_epi32(src
);
434 __m128i resHi
= _mm_shuffle_epi8(src
,
435 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
437 __m256i result
= _mm256_castsi128_si256(resLo
);
438 result
= _mm256_insertf128_si256(result
, resHi
, 1);
439 return _mm256_castsi256_ps(result
);
440 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
441 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
444 #error Unsupported vector width
448 static simdscalar
pack(simdscalar
&in
)
450 #if KNOB_SIMD_WIDTH == 8
451 simdscalari src
= _simd_castps_si(in
);
452 __m256i res
= _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
453 return _mm256_castsi256_ps(res
);
455 #error Unsupported vector width
458 #if ENABLE_AVX512_SIMD16
460 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
462 simd16scalar result
= _simd16_setzero_ps();
464 simdscalar resultlo
= _simd_load_ps(reinterpret_cast<const float *>(pSrc
));
466 result
= _simd16_insert_ps(result
, resultlo
, 0);
471 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
473 _simd_store_ps(reinterpret_cast<float *>(pDst
), _simd16_extract_ps(src
, 0));
476 static simd16scalar
unpack(simd16scalar
&in
)
478 simd16scalari result
= _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in
, 0)));
480 return _simd16_castsi_ps(result
);
483 static simd16scalar
pack(simd16scalar
&in
)
485 const simd16scalari zero
= _simd16_setzero_si();
487 simd16scalari permlo
= _simd16_permute2f128_si(_simd16_castps_si(in
), zero
, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
488 simd16scalari permhi
= _simd16_permute2f128_si(_simd16_castps_si(in
), zero
, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
490 simd16scalari result
= _simd16_packs_epi32(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
492 return _simd16_castsi_ps(result
);
497 //////////////////////////////////////////////////////////////////////////
498 /// PackTraits - Helpers for packing / unpacking 32 bit channels
499 //////////////////////////////////////////////////////////////////////////
501 struct PackTraits
<32, false>
503 static const uint32_t MyNumBits
= 32;
505 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_load_ps((const float*)pSrc
); }
506 static void storeSOA(uint8_t *pDst
, simdscalar src
) { _simd_store_ps((float*)pDst
, src
); }
507 static simdscalar
unpack(simdscalar
&in
) { return in
; }
508 static simdscalar
pack(simdscalar
&in
) { return in
; }
509 #if ENABLE_AVX512_SIMD16
511 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
513 return _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
516 static void storeSOA(uint8_t *pDst
, simd16scalar src
)
518 _simd16_store_ps(reinterpret_cast<float *>(pDst
), src
);
521 static simd16scalar
unpack(simd16scalar
&in
)
526 static simd16scalar
pack(simd16scalar
&in
)
533 //////////////////////////////////////////////////////////////////////////
534 /// TypeTraits - Format type traits.
535 //////////////////////////////////////////////////////////////////////////
536 template<SWR_TYPE type
, uint32_t NumBits
>
537 struct TypeTraits
: PackTraits
<NumBits
>
539 static const SWR_TYPE MyType
= type
;
540 static float toFloat() { return 0.0; }
541 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
542 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
545 //////////////////////////////////////////////////////////////////////////
546 /// TypeTraits - Format type traits specialization for UINT8
547 //////////////////////////////////////////////////////////////////////////
548 template<> struct TypeTraits
<SWR_TYPE_UINT
, 8> : PackTraits
<8>
550 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
551 static float toFloat() { return 0.0; }
552 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
553 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
556 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT8
558 //////////////////////////////////////////////////////////////////////////
559 template<> struct TypeTraits
<SWR_TYPE_SINT
, 8> : PackTraits
<8, true>
561 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
562 static float toFloat() { return 0.0; }
563 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
564 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
567 //////////////////////////////////////////////////////////////////////////
568 /// TypeTraits - Format type traits specialization for UINT16
569 //////////////////////////////////////////////////////////////////////////
570 template<> struct TypeTraits
<SWR_TYPE_UINT
, 16> : PackTraits
<16>
572 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
573 static float toFloat() { return 0.0; }
574 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
575 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
578 //////////////////////////////////////////////////////////////////////////
579 /// TypeTraits - Format type traits specialization for SINT16
580 //////////////////////////////////////////////////////////////////////////
581 template<> struct TypeTraits
<SWR_TYPE_SINT
, 16> : PackTraits
<16, true>
583 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
584 static float toFloat() { return 0.0; }
585 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
586 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
589 //////////////////////////////////////////////////////////////////////////
590 /// TypeTraits - Format type traits specialization for UINT32
591 //////////////////////////////////////////////////////////////////////////
592 template<> struct TypeTraits
<SWR_TYPE_UINT
, 32> : PackTraits
<32>
594 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
595 static float toFloat() { return 0.0; }
596 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
597 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
600 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT32
602 //////////////////////////////////////////////////////////////////////////
603 template<> struct TypeTraits
<SWR_TYPE_SINT
, 32> : PackTraits
<32>
605 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
606 static float toFloat() { return 0.0; }
607 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
608 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
611 //////////////////////////////////////////////////////////////////////////
612 /// TypeTraits - Format type traits specialization for UNORM5
613 //////////////////////////////////////////////////////////////////////////
614 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 5> : PackTraits
<5>
616 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
617 static float toFloat() { return 1.0f
/ 31.0f
; }
618 static float fromFloat() { return 31.0f
; }
619 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
622 //////////////////////////////////////////////////////////////////////////
623 /// TypeTraits - Format type traits specialization for UNORM6
624 //////////////////////////////////////////////////////////////////////////
625 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 6> : PackTraits
<6>
627 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
628 static float toFloat() { return 1.0f
/ 63.0f
; }
629 static float fromFloat() { return 63.0f
; }
630 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
633 //////////////////////////////////////////////////////////////////////////
634 /// TypeTraits - Format type traits specialization for UNORM8
635 //////////////////////////////////////////////////////////////////////////
636 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 8> : PackTraits
<8>
638 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
639 static float toFloat() { return 1.0f
/ 255.0f
; }
640 static float fromFloat() { return 255.0f
; }
641 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
644 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM8
646 //////////////////////////////////////////////////////////////////////////
647 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 8> : PackTraits
<8, true>
649 static const SWR_TYPE MyType
= SWR_TYPE_SNORM
;
650 static float toFloat() { return 1.0f
/ 127.0f
; }
651 static float fromFloat() { return 127.0f
; }
652 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
655 //////////////////////////////////////////////////////////////////////////
656 /// TypeTraits - Format type traits specialization for UNORM16
657 //////////////////////////////////////////////////////////////////////////
658 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 16> : PackTraits
<16>
660 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
661 static float toFloat() { return 1.0f
/ 65535.0f
; }
662 static float fromFloat() { return 65535.0f
; }
663 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
666 //////////////////////////////////////////////////////////////////////////
667 /// TypeTraits - Format type traits specialization for SNORM16
668 //////////////////////////////////////////////////////////////////////////
669 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 16> : PackTraits
<16, true>
671 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
672 static float toFloat() { return 1.0f
/ 32767.0f
; }
673 static float fromFloat() { return 32767.0f
; }
674 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
677 //////////////////////////////////////////////////////////////////////////
678 /// TypeTraits - Format type traits specialization for UNORM24
679 //////////////////////////////////////////////////////////////////////////
681 struct TypeTraits
< SWR_TYPE_UNORM
, 24 > : PackTraits
<32>
683 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
684 static float toFloat() { return 1.0f
/ 16777215.0f
; }
685 static float fromFloat() { return 16777215.0f
; }
686 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
689 //////////////////////////////////////////////////////////////////////////
690 // FLOAT Specializations from here on...
691 //////////////////////////////////////////////////////////////////////////
// Bit-preserving reinterpretation between float and integer 128-bit vectors.
#define TO_M128i(a) _mm_castps_si128(a)   // __m128  -> __m128i
#define TO_M128(a)  _mm_castsi128_ps(a)   // __m128i -> __m128
697 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
698 inline static __m128
fastpow(__m128 arg
) {
701 static const __m128 factor
= _mm_set1_ps(exp2(127.0f
* expden
/ expnum
- 127.0f
)
702 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
));
704 // Apply a constant pre-correction factor.
705 ret
= _mm_mul_ps(ret
, factor
);
707 // Reinterpret arg as integer to obtain logarithm.
708 //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
709 ret
= _mm_cvtepi32_ps(_mm_castps_si128(ret
));
711 // Multiply logarithm by power.
712 ret
= _mm_mul_ps(ret
, _mm_set1_ps(1.0f
* expnum
/ expden
));
714 // Convert back to "integer" to exponentiate.
715 //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
716 ret
= _mm_castsi128_ps(_mm_cvtps_epi32(ret
));
721 inline static __m128
pow512_4(__m128 arg
) {
722 // 5/12 is too small, so compute the 4th root of 20/12 instead.
723 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
724 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
725 __m128 xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
726 __m128 xover
= _mm_mul_ps(arg
, xf
);
728 __m128 xfm1
= _mm_rsqrt_ps(xf
);
729 __m128 x2
= _mm_mul_ps(arg
, arg
);
730 __m128 xunder
= _mm_mul_ps(x2
, xfm1
);
732 // sqrt2 * over + 2 * sqrt2 * under
733 __m128 xavg
= _mm_mul_ps(_mm_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
),
734 _mm_add_ps(xover
, xunder
));
736 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
737 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
741 inline static __m128
powf_wrapper(__m128 Base
, float Exp
)
743 float *f
= (float *)(&Base
);
745 return _mm_set_ps(powf(f
[3], Exp
),
751 static inline __m128
ConvertFloatToSRGB2(__m128
& Src
)
753 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
754 __m128i CmpToSRGBThresholdMask
= TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f
), Src
));
756 // squeeze the mask down to 16 bits (4 bits per DWORD)
757 int CompareResult
= _mm_movemask_epi8(CmpToSRGBThresholdMask
);
762 if (CompareResult
== 0xFFFF)
764 // all DWORDs are <= the threshold
765 Result
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
767 else if (CompareResult
== 0x0)
769 // all DWORDs are > the threshold
770 __m128 fSrc_0RGB
= Src
;
772 // --> 1.055f * c(1.0f/2.4f) - 0.055f
773 #if KNOB_USE_FAST_SRGB == TRUE
774 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
775 __m128 f
= pow512_4(fSrc_0RGB
);
777 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
779 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
780 Result
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
784 // some DWORDs are <= the threshold and some are > threshold
785 __m128 Src_0RGB_mul_denorm
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
787 __m128 fSrc_0RGB
= Src
;
789 // --> 1.055f * c(1.0f/2.4f) - 0.055f
790 #if KNOB_USE_FAST_SRGB == TRUE
791 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
792 __m128 f
= pow512_4(fSrc_0RGB
);
794 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
796 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
797 f
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
799 // Clear the alpha (is garbage after the sub)
800 __m128i i
= _mm_and_si128(TO_M128i(f
), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
802 __m128i LessThanPart
= _mm_and_si128(CmpToSRGBThresholdMask
, TO_M128i(Src_0RGB_mul_denorm
));
803 __m128i GreaterEqualPart
= _mm_andnot_si128(CmpToSRGBThresholdMask
, i
);
804 __m128i CombinedParts
= _mm_or_si128(LessThanPart
, GreaterEqualPart
);
806 Result
= TO_M128(CombinedParts
);
812 #if ENABLE_AVX512_SIMD16
813 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
814 inline static simd16scalar
fastpow(simd16scalar value
)
816 static const float factor1
= exp2(127.0f
* expden
/ expnum
- 127.0f
)
817 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
);
819 // Apply a constant pre-correction factor.
820 simd16scalar result
= _simd16_mul_ps(value
, _simd16_set1_ps(factor1
));
822 // Reinterpret arg as integer to obtain logarithm.
823 //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
824 result
= _simd16_cvtepi32_ps(_simd16_castps_si(result
));
826 // Multiply logarithm by power.
827 result
= _simd16_mul_ps(result
, _simd16_set1_ps(1.0f
* expnum
/ expden
));
829 // Convert back to "integer" to exponentiate.
830 //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
831 result
= _simd16_castsi_ps(_simd16_cvtps_epi32(result
));
836 inline static simd16scalar
pow512_4(simd16scalar arg
)
838 // 5/12 is too small, so compute the 4th root of 20/12 instead.
839 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
840 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
841 simd16scalar xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
842 simd16scalar xover
= _simd16_mul_ps(arg
, xf
);
844 simd16scalar xfm1
= _simd16_rsqrt_ps(xf
);
845 simd16scalar x2
= _simd16_mul_ps(arg
, arg
);
846 simd16scalar xunder
= _simd16_mul_ps(x2
, xfm1
);
848 // sqrt2 * over + 2 * sqrt2 * under
849 simd16scalar xavg
= _simd16_mul_ps(_simd16_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
), _simd16_add_ps(xover
, xunder
));
851 xavg
= _simd16_mul_ps(xavg
, _simd16_rsqrt_ps(xavg
));
852 xavg
= _simd16_mul_ps(xavg
, _simd16_rsqrt_ps(xavg
));
857 inline static simd16scalar
powf_wrapper(const simd16scalar base
, float exp
)
859 const float *f
= reinterpret_cast<const float *>(&base
);
861 return _simd16_set_ps(
881 // float to SRGB conversion formula
883 // if (value < 0.0031308f)
886 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
888 static inline simd16scalar
ConvertFloatToSRGB2(const simd16scalar
&value
)
890 // create a mask where the source is < the minimal SRGB float value
891 const simd16mask mask
= _simd16_cmplt_ps_mask(value
, _simd16_set1_ps(0.0031308f
));
893 // if all elements are < the threshold, result = value * 12.92
894 simd16scalar result
= _simd16_mul_ps(value
, _simd16_set1_ps(12.92f
));
896 if (_simd16_mask2int(mask
) != 0xFFFF)
898 // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
899 #if KNOB_USE_FAST_SRGB == TRUE
900 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
901 simd16scalar result2
= pow512_4(value
);
903 simd16scalar result2
= powf_wrapper(value
, 1.0f
/ 2.4f
);
906 result2
= _simd16_mul_ps(result2
, _simd16_set1_ps(1.055f
));
907 result2
= _simd16_sub_ps(result2
, _simd16_set1_ps(0.055f
));
909 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
910 // only native AVX512 can directly use the computed mask for the blend operation
911 result
= _mm512_mask_blend_ps(mask
, result2
, result
);
913 result
= _simd16_blendv_ps(result2
, result
, _simd16_cmplt_ps(value
, _simd16_set1_ps(0.0031308f
)));
921 //////////////////////////////////////////////////////////////////////////
922 /// TypeTraits - Format type traits specialization for FLOAT16
923 //////////////////////////////////////////////////////////////////////////
// TypeTraits specialization for 16-bit float components, layered on PackTraits<16>.
// Supplies float32 -> float16 packing for the 8-wide vector type (simdscalar) and,
// when ENABLE_AVX512_SIMD16 is set, for the 16-wide type (simd16scalar).
// Both unpack() overloads and the 8-wide convertSrgb() are not implemented: they
// flag SWR_NOT_IMPL and return a zero vector.
924 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 16> : PackTraits
<16>
926 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
// Both scale factors are 1.0f for this type.
927 static float toFloat() { return 1.0f
; }
928 static float fromFloat() { return 1.0f
; }
// Not implemented for FLOAT16: flags SWR_NOT_IMPL and returns zero.
929 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
// pack (8-wide): converts the float32 lanes of `in` to float16.
// Under KNOB_ARCH_AVX the conversion is emulated with integer operations
// (mantissa is truncated by a right shift, no rounding step is visible); the
// other visible path calls _mm256_cvtps_ph with _MM_FROUND_TRUNC.
// The 16-bit results end up in the lower 128 bits of the returned vector.
931 static simdscalar
pack(const simdscalar
&in
)
933 #if KNOB_SIMD_WIDTH == 8
934 #if (KNOB_ARCH == KNOB_ARCH_AVX)
935 // input is 8 packed float32, output is 8 packed float16
936 simdscalari src
= _simd_castps_si(in
);
// Field layout of the source (float32) format.
938 static const uint32_t FLOAT_EXP_BITS
= 8;
939 static const uint32_t FLOAT_MANTISSA_BITS
= 23;
940 static const uint32_t FLOAT_MANTISSA_MASK
= (1U << FLOAT_MANTISSA_BITS
) - 1;
941 static const uint32_t FLOAT_EXP_MASK
= ((1U << FLOAT_EXP_BITS
) - 1) << FLOAT_MANTISSA_BITS
;
// Field layout of the destination (float16) format.
943 static const uint32_t HALF_EXP_BITS
= 5;
944 static const uint32_t HALF_MANTISSA_BITS
= 10;
945 static const uint32_t HALF_EXP_MASK
= ((1U << HALF_EXP_BITS
) - 1) << HALF_MANTISSA_BITS
;
947 // minimum exponent required, exponents below this are flushed to 0.
948 static const int32_t HALF_EXP_MIN
= -14;
949 static const int32_t FLOAT_EXP_BIAS
= 127;
// HALF_EXP_MIN expressed as a biased float32 exponent.
950 static const int32_t FLOAT_EXP_MIN
= HALF_EXP_MIN
+ FLOAT_EXP_BIAS
;
// Biased float32 exponents below this value produce zero.
951 static const int32_t FLOAT_EXP_MIN_FTZ
= FLOAT_EXP_MIN
- (HALF_MANTISSA_BITS
+ 1); // +1 for the lack of implicit significand
953 // maximum exponent required, exponents above this are set to infinity
954 static const int32_t HALF_EXP_MAX
= 15;
955 static const int32_t FLOAT_EXP_MAX
= HALF_EXP_MAX
+ FLOAT_EXP_BIAS
;
// Broadcast constants; the exponent limits are positioned in the float32
// exponent field so they can be compared directly against vExp.
957 const simdscalari vSignMask
= _simd_set1_epi32(0x80000000);
958 const simdscalari vExpMask
= _simd_set1_epi32(FLOAT_EXP_MASK
);
959 const simdscalari vManMask
= _simd_set1_epi32(FLOAT_MANTISSA_MASK
);
960 const simdscalari vExpMin
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN
<< FLOAT_MANTISSA_BITS
));
961 const simdscalari vExpMinFtz
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN_FTZ
<< FLOAT_MANTISSA_BITS
));
962 const simdscalari vExpMax
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MAX
<< FLOAT_MANTISSA_BITS
));
// Split every lane into its sign, exponent and mantissa fields (still in
// their float32 bit positions).
964 simdscalari vSign
= _simd_and_si(src
, vSignMask
);
965 simdscalari vExp
= _simd_and_si(src
, vExpMask
);
966 simdscalari vMan
= _simd_and_si(src
, vManMask
);
// Per-lane classification:
//   vFTZMask    - exponent below FLOAT_EXP_MIN_FTZ (result forced to zero)
//   vDenormMask - exponent below FLOAT_EXP_MIN but not in vFTZMask
//   vInfMask    - exponent field all ones
//   vClampMask  - exponent above FLOAT_EXP_MAX but not in vInfMask
968 simdscalari vFTZMask
= _simd_cmplt_epi32(vExp
, vExpMinFtz
);
969 simdscalari vDenormMask
= _simd_andnot_si(vFTZMask
, _simd_cmplt_epi32(vExp
, vExpMin
));
970 simdscalari vInfMask
= _simd_cmpeq_epi32(vExpMask
, vExp
);
971 simdscalari vClampMask
= _simd_andnot_si(vInfMask
, _simd_cmplt_epi32(vExpMax
, vExp
));
// Rebase the exponent: (vExp - vExpMin) + 1 in exponent-field units, so an
// exponent equal to FLOAT_EXP_MIN maps to a half exponent field of 1.
973 simdscalari vHalfExp
= _simd_add_epi32(_simd_sub_epi32(vExp
, vExpMin
), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS
));
975 // pack output 16-bits into the lower 16-bits of each 32-bit channel
976 simdscalari vDst
= _simd_and_si(_simd_srli_epi32(vHalfExp
, 13), _simd_set1_epi32(HALF_EXP_MASK
));
// Keep the top HALF_MANTISSA_BITS of the mantissa (lower bits are dropped).
977 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vMan
, FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
// Zero the lanes selected by vFTZMask.
980 vDst
= _simd_andnot_si(vFTZMask
, vDst
);
981 // Apply Infinities / NaN
982 vDst
= _simd_or_si(vDst
, _simd_and_si(vInfMask
, _simd_set1_epi32(HALF_EXP_MASK
)));
// Lanes selected by vClampMask are replaced by 0x7BFF
// (half exponent field 0x1E with all ten mantissa bits set).
985 vDst
= _simd_andnot_si(vClampMask
, vDst
);
986 vDst
= _simd_or_si(vDst
,
987 _simd_and_si(vClampMask
, _simd_set1_epi32(0x7BFF)));
989 // Compute Denormals (subnormals)
// Scalar fix-up, entered only when at least one lane is set in vDenormMask.
990 if (!_mm256_testz_si256(vDenormMask
, vDenormMask
))
// Scalar views of the vector temporaries for per-lane access.
// NOTE(review): no visible line reads pDenormMask; a per-lane guard on it is
// presumably part of the loop body -- confirm against the full file.
992 uint32_t *pDenormMask
= (uint32_t*)&vDenormMask
;
993 uint32_t *pExp
= (uint32_t*)&vExp
;
994 uint32_t *pMan
= (uint32_t*)&vMan
;
995 uint32_t *pDst
= (uint32_t*)&vDst
;
996 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
1000 // Need to compute subnormal value
1001 uint32_t exponent
= pExp
[i
] >> FLOAT_MANTISSA_BITS
;
1002 uint32_t mantissa
= pMan
[i
] |
1003 (1U << FLOAT_MANTISSA_BITS
); // Denorms include no "implicit" 1s. Make it explicit
// Shift out the usual 13 mantissa bits plus one extra bit for every step the
// exponent lies below FLOAT_EXP_MIN.
1005 pDst
[i
] = mantissa
>> ((FLOAT_EXP_MIN
- exponent
) + (FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
// Move the sign from bit 31 down to bit 15 and merge it in.
1011 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vSign
, 16));
1013 // Pack to lower 128-bits
1014 vDst
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst
), _mm256_extractf128_si256(vDst
, 1)));
// Debug-only cross-check: compares the first four 32-bit lanes of the emulated
// result against _mm256_cvtps_ph.
1017 #if !defined(NDEBUG)
1018 simdscalari vCheck
= _mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
));
1020 for (uint32_t i
= 0; i
< 4; ++i
)
1022 SWR_ASSERT(vCheck
.m256i_i32
[i
] == vDst
.m256i_i32
[i
]);
1027 return _simd_castsi_ps(vDst
);
// Alternate path: direct conversion instruction, truncating.
1030 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
)));
1033 #error Unsupported vector width
// unpack (8-wide): not implemented; flags SWR_NOT_IMPL and returns zero.
1037 static simdscalar
unpack(const simdscalar
&in
)
1039 // input is 8 packed float16, output is 8 packed float32
1040 SWR_NOT_IMPL
; // @todo
1041 return _simd_setzero_ps();
1043 #if ENABLE_AVX512_SIMD16
// pack (16-wide): converts each 8-lane half of `in` to float16, places the two
// 128-bit results side by side in an 8-wide vector, and inserts that vector as
// half 0 of the result. Half 1 of the result keeps its zero initialisation.
1045 static simd16scalar
pack(const simd16scalar
&in
)
1047 simd16scalari result
= _simd16_setzero_si();
1048 simdscalari resultlo
= _simd_setzero_si();
1050 #if (KNOB_ARCH == KNOB_ARCH_AVX)
// Emulated path: reuse the 8-wide pack() and take its lower 128 bits.
1051 simdscalar simdlo
= pack(_simd16_extract_ps(in
, 0));
1052 simdscalar simdhi
= pack(_simd16_extract_ps(in
, 1));
1054 __m128i templo
= _simd_extractf128_si(_simd_castps_si(simdlo
), 0);
1055 __m128i temphi
= _simd_extractf128_si(_simd_castps_si(simdhi
), 0);
// Alternate path: direct conversion instruction on each half, truncating.
1058 __m128i templo
= _mm256_cvtps_ph(_simd16_extract_ps(in
, 0), _MM_FROUND_TRUNC
);
1059 __m128i temphi
= _mm256_cvtps_ph(_simd16_extract_ps(in
, 1), _MM_FROUND_TRUNC
);
1062 resultlo
= _simd_insertf128_si(resultlo
, templo
, 0);
1063 resultlo
= _simd_insertf128_si(resultlo
, temphi
, 1);
1065 result
= _simd16_insert_si(result
, resultlo
, 0);
1067 return _simd16_castsi_ps(result
);
// unpack (16-wide): not implemented; flags SWR_NOT_IMPL and returns zero.
1070 static simd16scalar
unpack(const simd16scalar
&in
)
1072 // input is 16 packed float16, output is 16 packed float32
1073 SWR_NOT_IMPL
; // @todo
1074 return _simd16_setzero_ps();
1079 //////////////////////////////////////////////////////////////////////////
1080 /// TypeTraits - Format type traits specialization for FLOAT32
1081 //////////////////////////////////////////////////////////////////////////
1082 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 32> : PackTraits
<32>
1084 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
1085 static float toFloat() { return 1.0f
; }
1086 static float fromFloat() { return 1.0f
; }
1087 static inline simdscalar
convertSrgb(simdscalar
&in
)
1089 #if KNOB_SIMD_WIDTH == 8
1090 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
1091 __m128 srcLo
= _mm256_extractf128_ps(in
, 0);
1092 __m128 srcHi
= _mm256_extractf128_ps(in
, 1);
1094 srcLo
= ConvertFloatToSRGB2(srcLo
);
1095 srcHi
= ConvertFloatToSRGB2(srcHi
);
1097 in
= _mm256_insertf128_ps(in
, srcLo
, 0);
1098 in
= _mm256_insertf128_ps(in
, srcHi
, 1);
1101 #error Unsupported vector width
1105 #if ENABLE_AVX512_SIMD16
1107 static inline simd16scalar
convertSrgb(simd16scalar
&in
)
1109 return ConvertFloatToSRGB2(in
);
1114 //////////////////////////////////////////////////////////////////////////
1115 /// Format1 - Bitfield for single component formats.
1116 //////////////////////////////////////////////////////////////////////////
1117 template<uint32_t x
>
1124 ///@ The following are here to provide full template needed in Formats.
1131 //////////////////////////////////////////////////////////////////////////
1132 /// Format1 - Bitfield for single component formats - 8 bit specialization
1133 //////////////////////////////////////////////////////////////////////////
1141 ///@ The following are here to provide full template needed in Formats.
1148 //////////////////////////////////////////////////////////////////////////
1149 /// Format1 - Bitfield for single component formats - 16 bit specialization
1150 //////////////////////////////////////////////////////////////////////////
1158 ///@ The following are here to provide full template needed in Formats.
1165 //////////////////////////////////////////////////////////////////////////
1166 /// Format2 - Bitfield for 2 component formats.
1167 //////////////////////////////////////////////////////////////////////////
1168 template<uint32_t x
, uint32_t y
>
1178 ///@ The following are here to provide full template needed in Formats.
1184 //////////////////////////////////////////////////////////////////////////
1185 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
1186 //////////////////////////////////////////////////////////////////////////
1197 ///@ The following are here to provide full template needed in Formats.
1203 //////////////////////////////////////////////////////////////////////////
1204 /// Format3 - Bitfield for 3 component formats.
1205 //////////////////////////////////////////////////////////////////////////
1206 template<uint32_t x
, uint32_t y
, uint32_t z
>
1215 uint32_t a
; ///@note This is here to provide full template needed in Formats.
1218 //////////////////////////////////////////////////////////////////////////
1219 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
1220 //////////////////////////////////////////////////////////////////////////
1222 union Format3
<5,6,5>
1230 uint16_t a
; ///@note This is here to provide full template needed in Formats.
1233 //////////////////////////////////////////////////////////////////////////
1234 /// Format4 - Bitfield for 4 component formats.
1235 //////////////////////////////////////////////////////////////////////////
1236 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
1245 //////////////////////////////////////////////////////////////////////////
1246 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1247 //////////////////////////////////////////////////////////////////////////
1249 struct Format4
<5,5,5,1>
1257 //////////////////////////////////////////////////////////////////////////
1258 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1259 //////////////////////////////////////////////////////////////////////////
1261 struct Format4
<4,4,4,4>
1269 //////////////////////////////////////////////////////////////////////////
1270 /// ComponentTraits - Default components
1271 //////////////////////////////////////////////////////////////////////////
1272 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
1275 INLINE
static uint32_t GetDefault(uint32_t comp
)
1277 static const uint32_t defaults
[4]{ x
, y
, z
, w
};
1278 return defaults
[comp
];
1282 //////////////////////////////////////////////////////////////////////////
1283 /// ComponentTraits - Component type traits.
1284 //////////////////////////////////////////////////////////////////////////
1285 template<SWR_TYPE X
, uint32_t NumBitsX
, SWR_TYPE Y
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsY
= 0, SWR_TYPE Z
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsZ
= 0, SWR_TYPE W
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsW
= 0>
1286 struct ComponentTraits
1288 INLINE
static SWR_TYPE
GetType(uint32_t comp
)
1290 static const SWR_TYPE CompType
[4]{ X
, Y
, Z
, W
};
1291 return CompType
[comp
];
1294 INLINE
static uint32_t GetBPC(uint32_t comp
)
1296 static const uint32_t MyBpc
[4]{ NumBitsX
, NumBitsY
, NumBitsZ
, NumBitsW
};
1300 INLINE
static bool isNormalized(uint32_t comp
)
1305 return (X
== SWR_TYPE_UNORM
|| X
== SWR_TYPE_SNORM
) ? true : false;
1307 return (Y
== SWR_TYPE_UNORM
|| Y
== SWR_TYPE_SNORM
) ? true : false;
1309 return (Z
== SWR_TYPE_UNORM
|| Z
== SWR_TYPE_SNORM
) ? true : false;
1311 return (W
== SWR_TYPE_UNORM
|| W
== SWR_TYPE_SNORM
) ? true : false;
1313 SWR_INVALID("Invalid component: %d", comp
);
1317 INLINE
static float toFloat(uint32_t comp
)
1322 return TypeTraits
<X
, NumBitsX
>::toFloat();
1324 return TypeTraits
<Y
, NumBitsY
>::toFloat();
1326 return TypeTraits
<Z
, NumBitsZ
>::toFloat();
1328 return TypeTraits
<W
, NumBitsW
>::toFloat();
1330 SWR_INVALID("Invalid component: %d", comp
);
1331 return TypeTraits
<X
, NumBitsX
>::toFloat();
1335 INLINE
static float fromFloat(uint32_t comp
)
1340 return TypeTraits
<X
, NumBitsX
>::fromFloat();
1342 return TypeTraits
<Y
, NumBitsY
>::fromFloat();
1344 return TypeTraits
<Z
, NumBitsZ
>::fromFloat();
1346 return TypeTraits
<W
, NumBitsW
>::fromFloat();
1348 SWR_INVALID("Invalid component: %d", comp
);
1349 return TypeTraits
<X
, NumBitsX
>::fromFloat();
1352 INLINE
static simdscalar
loadSOA(uint32_t comp
, const uint8_t* pSrc
)
1357 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1359 return TypeTraits
<Y
, NumBitsY
>::loadSOA(pSrc
);
1361 return TypeTraits
<Z
, NumBitsZ
>::loadSOA(pSrc
);
1363 return TypeTraits
<W
, NumBitsW
>::loadSOA(pSrc
);
1365 SWR_INVALID("Invalid component: %d", comp
);
1366 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1369 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simdscalar src
)
1374 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1377 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1380 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1383 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1386 SWR_INVALID("Invalid component: %d", comp
);
1387 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1390 INLINE
static simdscalar
unpack(uint32_t comp
, simdscalar
&in
)
1395 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1397 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1399 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1401 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1403 SWR_INVALID("Invalid component: %d", comp
);
1404 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1407 INLINE
static simdscalar
pack(uint32_t comp
, simdscalar
&in
)
1412 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1414 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1416 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1418 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1420 SWR_INVALID("Invalid component: %d", comp
);
1421 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1424 INLINE
static simdscalar
convertSrgb(uint32_t comp
, simdscalar
&in
)
1429 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1431 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1433 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1435 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1437 SWR_INVALID("Invalid component: %d", comp
);
1438 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1440 #if ENABLE_AVX512_SIMD16
1442 INLINE
static simd16scalar
loadSOA_16(uint32_t comp
, const uint8_t* pSrc
)
1447 return TypeTraits
<X
, NumBitsX
>::loadSOA_16(pSrc
);
1449 return TypeTraits
<Y
, NumBitsY
>::loadSOA_16(pSrc
);
1451 return TypeTraits
<Z
, NumBitsZ
>::loadSOA_16(pSrc
);
1453 return TypeTraits
<W
, NumBitsW
>::loadSOA_16(pSrc
);
1455 SWR_INVALID("Invalid component: %d", comp
);
1456 return TypeTraits
<X
, NumBitsX
>::loadSOA_16(pSrc
);
1459 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simd16scalar src
)
1464 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1467 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1470 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1473 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1476 SWR_INVALID("Invalid component: %d", comp
);
1477 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1480 INLINE
static simd16scalar
unpack(uint32_t comp
, simd16scalar
&in
)
1485 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1487 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1489 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1491 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1493 SWR_INVALID("Invalid component: %d", comp
);
1494 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1497 INLINE
static simd16scalar
pack(uint32_t comp
, simd16scalar
&in
)
1502 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1504 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1506 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1508 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1510 SWR_INVALID("Invalid component: %d", comp
);
1511 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1514 INLINE
static simd16scalar
convertSrgb(uint32_t comp
, simd16scalar
&in
)
1519 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1521 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1523 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1525 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1527 SWR_INVALID("Invalid component: %d", comp
);
1528 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);