1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for SWR_FORMAT functions.
27 ******************************************************************************/
31 #include "common/simdintrin.h"
33 //////////////////////////////////////////////////////////////////////////
34 /// PackTraits - Helpers for packing / unpacking same pixel sizes
35 //////////////////////////////////////////////////////////////////////////
36 template <uint32_t NumBits
, bool Signed
= false>
39 static const uint32_t MyNumBits
= NumBits
;
40 static simdscalar
loadSOA(const uint8_t *pSrc
) = delete;
41 static void storeSOA(uint8_t *pDst
, simdscalar
const &src
) = delete;
42 static simdscalar
unpack(simdscalar
&in
) = delete;
43 static simdscalar
pack(simdscalar
&in
) = delete;
44 #if ENABLE_AVX512_SIMD16
45 static simd16scalar
loadSOA_16(const uint8_t *pSrc
) = delete;
46 static void SIMDCALL
storeSOA(uint8_t *pDst
, simd16scalar
const &src
) = delete;
47 static simd16scalar
unpack(simd16scalar
&in
) = delete;
48 static simd16scalar
pack(simd16scalar
&in
) = delete;
52 //////////////////////////////////////////////////////////////////////////
53 /// PackTraits - Helpers for packing / unpacking unused channels
54 //////////////////////////////////////////////////////////////////////////
56 struct PackTraits
<0, false>
58 static const uint32_t MyNumBits
= 0;
60 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_setzero_ps(); }
61 static void storeSOA(uint8_t *pDst
, simdscalar
const &src
) { return; }
62 static simdscalar
unpack(simdscalar
&in
) { return _simd_setzero_ps(); }
63 static simdscalar
pack(simdscalar
&in
) { return _simd_setzero_ps(); }
64 #if ENABLE_AVX512_SIMD16
65 static simd16scalar
loadSOA_16(const uint8_t *pSrc
) { return _simd16_setzero_ps(); }
66 static void SIMDCALL
storeSOA(uint8_t *pDst
, simd16scalar
const &src
) { return; }
67 static simd16scalar
unpack(simd16scalar
&in
) { return _simd16_setzero_ps(); }
68 static simd16scalar
pack(simd16scalar
&in
) { return _simd16_setzero_ps(); }
72 //////////////////////////////////////////////////////////////////////////
73 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
74 //////////////////////////////////////////////////////////////////////////
76 struct PackTraits
<8, false>
78 static const uint32_t MyNumBits
= 8;
80 static simdscalar
loadSOA(const uint8_t *pSrc
)
82 #if KNOB_SIMD_WIDTH == 8
83 __m256 result
= _mm256_setzero_ps();
84 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
85 return _mm256_insertf128_ps(result
, vLo
, 0);
87 #error Unsupported vector width
91 static void storeSOA(uint8_t *pDst
, simdscalar
const &src
)
94 #if KNOB_SIMD_WIDTH == 8
95 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
97 #error Unsupported vector width
101 static simdscalar
unpack(simdscalar
&in
)
103 #if KNOB_SIMD_WIDTH == 8
104 #if KNOB_ARCH <= KNOB_ARCH_AVX
105 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
106 __m128i resLo
= _mm_cvtepu8_epi32(src
);
107 __m128i resHi
= _mm_shuffle_epi8(src
,
108 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
110 __m256i result
= _mm256_castsi128_si256(resLo
);
111 result
= _mm256_insertf128_si256(result
, resHi
, 1);
112 return simdscalar
{ _mm256_castsi256_ps(result
) };
114 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
117 #error Unsupported vector width
121 static simdscalar
pack(simdscalar
&in
)
123 #if KNOB_SIMD_WIDTH == 8
124 simdscalari src
= _simd_castps_si(in
);
125 __m128i res16
= _mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
126 __m128i res8
= _mm_packus_epi16(res16
, _mm_undefined_si128());
127 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
129 #error Unsupported vector width
132 #if ENABLE_AVX512_SIMD16
134 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
136 simd16scalar result
= _simd16_setzero_ps();
137 simdscalar resultlo
= _simd_setzero_ps();
139 const __m128 src
= _mm_load_ps(reinterpret_cast<const float *>(pSrc
));
141 resultlo
= _mm256_insertf128_ps(resultlo
, src
, 0);
142 result
= _simd16_insert_ps(result
, resultlo
, 0);
147 static void SIMDCALL
storeSOA(uint8_t *pDst
, simd16scalar
const &src
)
149 // store simd16 bytes
150 _mm_store_ps(reinterpret_cast<float *>(pDst
), _mm256_castps256_ps128(_simd16_extract_ps(src
, 0)));
153 static simd16scalar
unpack(simd16scalar
&in
)
155 simd4scalari tmp
= _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in
, 0)));
156 simd16scalari result
= _simd16_cvtepu8_epi32(tmp
);
158 return _simd16_castsi_ps(result
);
161 static simd16scalar
pack(simd16scalar
&in
)
163 simd16scalari result
= _simd16_setzero_si();
165 simdscalari inlo
= _simd_castps_si(_simd16_extract_ps(in
, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
166 simdscalari inhi
= _simd_castps_si(_simd16_extract_ps(in
, 1)); // r8 r9 rA rB rC rD rE rF
168 simdscalari permlo
= _simd_permute2f128_si(inlo
, inhi
, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
169 simdscalari permhi
= _simd_permute2f128_si(inlo
, inhi
, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
171 simdscalari pack
= _simd_packus_epi32(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
173 const simdscalari zero
= _simd_setzero_si();
175 permlo
= _simd_permute2f128_si(pack
, zero
, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
176 permhi
= _simd_permute2f128_si(pack
, zero
, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
178 pack
= _simd_packus_epi16(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
180 result
= _simd16_insert_si(result
, pack
, 0);
182 return _simd16_castsi_ps(result
);
187 //////////////////////////////////////////////////////////////////////////
188 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
189 //////////////////////////////////////////////////////////////////////////
191 struct PackTraits
<8, true>
193 static const uint32_t MyNumBits
= 8;
195 static simdscalar
loadSOA(const uint8_t *pSrc
)
197 #if KNOB_SIMD_WIDTH == 8
198 __m256 result
= _mm256_setzero_ps();
199 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
200 return _mm256_insertf128_ps(result
, vLo
, 0);
202 #error Unsupported vector width
206 static void storeSOA(uint8_t *pDst
, simdscalar
const &src
)
209 #if KNOB_SIMD_WIDTH == 8
210 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
212 #error Unsupported vector width
216 static simdscalar
unpack(simdscalar
&in
)
218 #if KNOB_SIMD_WIDTH == 8
219 #if KNOB_ARCH <= KNOB_ARCH_AVX
220 SWR_INVALID("I think this may be incorrect.");
221 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
222 __m128i resLo
= _mm_cvtepi8_epi32(src
);
223 __m128i resHi
= _mm_shuffle_epi8(src
,
224 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
226 __m256i result
= _mm256_castsi128_si256(resLo
);
227 result
= _mm256_insertf128_si256(result
, resHi
, 1);
228 return _mm256_castsi256_ps(result
);
230 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
233 #error Unsupported vector width
237 static simdscalar
pack(simdscalar
&in
)
239 #if KNOB_SIMD_WIDTH == 8
240 simdscalari src
= _simd_castps_si(in
);
241 __m128i res16
= _mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
242 __m128i res8
= _mm_packs_epi16(res16
, _mm_undefined_si128());
243 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
245 #error Unsupported vector width
248 #if ENABLE_AVX512_SIMD16
250 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
252 simd16scalar result
= _simd16_setzero_ps();
253 simdscalar resultlo
= _simd_setzero_ps();
255 const __m128 src
= _mm_load_ps(reinterpret_cast<const float *>(pSrc
));
257 resultlo
= _mm256_insertf128_ps(resultlo
, src
, 0);
258 result
= _simd16_insert_ps(result
, resultlo
, 0);
263 static void SIMDCALL
storeSOA(uint8_t *pDst
, simd16scalar
const &src
)
265 // store simd16 bytes
266 _mm_store_ps(reinterpret_cast<float *>(pDst
), _mm256_castps256_ps128(_simd16_extract_ps(src
, 0)));
269 static simd16scalar
unpack(simd16scalar
&in
)
271 simd4scalari tmp
= _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in
, 0)));
272 simd16scalari result
= _simd16_cvtepu8_epi32(tmp
);
274 return _simd16_castsi_ps(result
);
277 static simd16scalar
pack(simd16scalar
&in
)
279 simd16scalari result
= _simd16_setzero_si();
281 simdscalari inlo
= _simd_castps_si(_simd16_extract_ps(in
, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
282 simdscalari inhi
= _simd_castps_si(_simd16_extract_ps(in
, 1)); // r8 r9 rA rB rC rD rE rF
284 simdscalari permlo
= _simd_permute2f128_si(inlo
, inhi
, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
285 simdscalari permhi
= _simd_permute2f128_si(inlo
, inhi
, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
287 simdscalari pack
= _simd_packs_epi32(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
289 const simdscalari zero
= _simd_setzero_si();
291 permlo
= _simd_permute2f128_si(pack
, zero
, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
292 permhi
= _simd_permute2f128_si(pack
, zero
, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
294 pack
= _simd_packs_epi16(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
296 result
= _simd16_insert_si(result
, pack
, 0);
298 return _simd16_castsi_ps(result
);
303 //////////////////////////////////////////////////////////////////////////
304 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
305 //////////////////////////////////////////////////////////////////////////
307 struct PackTraits
<16, false>
309 static const uint32_t MyNumBits
= 16;
311 static simdscalar
loadSOA(const uint8_t *pSrc
)
313 #if KNOB_SIMD_WIDTH == 8
314 __m256 result
= _mm256_setzero_ps();
315 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
316 return _mm256_insertf128_ps(result
, vLo
, 0);
318 #error Unsupported vector width
322 static void storeSOA(uint8_t *pDst
, simdscalar
const &src
)
324 #if KNOB_SIMD_WIDTH == 8
325 // store 16B (2B * 8)
326 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
328 #error Unsupported vector width
332 static simdscalar
unpack(simdscalar
&in
)
334 #if KNOB_SIMD_WIDTH == 8
335 #if KNOB_ARCH <= KNOB_ARCH_AVX
336 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
337 __m128i resLo
= _mm_cvtepu16_epi32(src
);
338 __m128i resHi
= _mm_shuffle_epi8(src
,
339 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
341 __m256i result
= _mm256_castsi128_si256(resLo
);
342 result
= _mm256_insertf128_si256(result
, resHi
, 1);
343 return _mm256_castsi256_ps(result
);
345 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
348 #error Unsupported vector width
352 static simdscalar
pack(simdscalar
&in
)
354 #if KNOB_SIMD_WIDTH == 8
355 simdscalari src
= _simd_castps_si(in
);
356 __m256i res
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
357 return _mm256_castsi256_ps(res
);
359 #error Unsupported vector width
362 #if ENABLE_AVX512_SIMD16
364 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
366 simd16scalar result
= _simd16_setzero_ps();
368 simdscalar resultlo
= _simd_load_ps(reinterpret_cast<const float *>(pSrc
));
370 result
= _simd16_insert_ps(result
, resultlo
, 0);
375 static void SIMDCALL
storeSOA(uint8_t *pDst
, simd16scalar
const &src
)
377 _simd_store_ps(reinterpret_cast<float *>(pDst
), _simd16_extract_ps(src
, 0));
380 static simd16scalar
unpack(simd16scalar
&in
)
382 simd16scalari result
= _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in
, 0)));
384 return _simd16_castsi_ps(result
);
387 static simd16scalar
pack(simd16scalar
&in
)
389 const simd16scalari zero
= _simd16_setzero_si();
391 simd16scalari permlo
= _simd16_permute2f128_si(_simd16_castps_si(in
), zero
, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
392 simd16scalari permhi
= _simd16_permute2f128_si(_simd16_castps_si(in
), zero
, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
394 simd16scalari result
= _simd16_packus_epi32(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
396 return _simd16_castsi_ps(result
);
401 //////////////////////////////////////////////////////////////////////////
402 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
403 //////////////////////////////////////////////////////////////////////////
405 struct PackTraits
<16, true>
407 static const uint32_t MyNumBits
= 16;
409 static simdscalar
loadSOA(const uint8_t *pSrc
)
411 #if KNOB_SIMD_WIDTH == 8
412 __m256 result
= _mm256_setzero_ps();
413 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
414 return _mm256_insertf128_ps(result
, vLo
, 0);
416 #error Unsupported vector width
420 static void storeSOA(uint8_t *pDst
, simdscalar
const &src
)
422 #if KNOB_SIMD_WIDTH == 8
423 // store 16B (2B * 8)
424 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
426 #error Unsupported vector width
430 static simdscalar
unpack(simdscalar
&in
)
432 #if KNOB_SIMD_WIDTH == 8
433 #if KNOB_ARCH <= KNOB_ARCH_AVX
434 SWR_INVALID("I think this may be incorrect.");
435 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
436 __m128i resLo
= _mm_cvtepi16_epi32(src
);
437 __m128i resHi
= _mm_shuffle_epi8(src
,
438 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
440 __m256i result
= _mm256_castsi128_si256(resLo
);
441 result
= _mm256_insertf128_si256(result
, resHi
, 1);
442 return _mm256_castsi256_ps(result
);
444 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
447 #error Unsupported vector width
451 static simdscalar
pack(simdscalar
&in
)
453 #if KNOB_SIMD_WIDTH == 8
454 simdscalari src
= _simd_castps_si(in
);
455 __m256i res
= _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
456 return _mm256_castsi256_ps(res
);
458 #error Unsupported vector width
461 #if ENABLE_AVX512_SIMD16
463 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
465 simd16scalar result
= _simd16_setzero_ps();
467 simdscalar resultlo
= _simd_load_ps(reinterpret_cast<const float *>(pSrc
));
469 result
= _simd16_insert_ps(result
, resultlo
, 0);
474 static void SIMDCALL
storeSOA(uint8_t *pDst
, simd16scalar
const &src
)
476 _simd_store_ps(reinterpret_cast<float *>(pDst
), _simd16_extract_ps(src
, 0));
479 static simd16scalar
unpack(simd16scalar
&in
)
481 simd16scalari result
= _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in
, 0)));
483 return _simd16_castsi_ps(result
);
486 static simd16scalar
pack(simd16scalar
&in
)
488 const simd16scalari zero
= _simd16_setzero_si();
490 simd16scalari permlo
= _simd16_permute2f128_si(_simd16_castps_si(in
), zero
, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
491 simd16scalari permhi
= _simd16_permute2f128_si(_simd16_castps_si(in
), zero
, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
493 simd16scalari result
= _simd16_packs_epi32(permlo
, permhi
); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
495 return _simd16_castsi_ps(result
);
500 //////////////////////////////////////////////////////////////////////////
501 /// PackTraits - Helpers for packing / unpacking 32 bit channels
502 //////////////////////////////////////////////////////////////////////////
504 struct PackTraits
<32, false>
506 static const uint32_t MyNumBits
= 32;
508 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_load_ps((const float*)pSrc
); }
509 static void storeSOA(uint8_t *pDst
, simdscalar
const &src
) { _simd_store_ps((float*)pDst
, src
); }
510 static simdscalar
unpack(simdscalar
&in
) { return in
; }
511 static simdscalar
pack(simdscalar
&in
) { return in
; }
512 #if ENABLE_AVX512_SIMD16
514 static simd16scalar
loadSOA_16(const uint8_t *pSrc
)
516 return _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
519 static void SIMDCALL
storeSOA(uint8_t *pDst
, simd16scalar
const &src
)
521 _simd16_store_ps(reinterpret_cast<float *>(pDst
), src
);
524 static simd16scalar
unpack(simd16scalar
&in
)
529 static simd16scalar
pack(simd16scalar
&in
)
536 //////////////////////////////////////////////////////////////////////////
537 /// TypeTraits - Format type traits.
538 //////////////////////////////////////////////////////////////////////////
539 template<SWR_TYPE type
, uint32_t NumBits
>
540 struct TypeTraits
: PackTraits
<NumBits
>
542 static const SWR_TYPE MyType
= type
;
543 static float toFloat() { return 0.0; }
544 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
545 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
548 //////////////////////////////////////////////////////////////////////////
549 /// TypeTraits - Format type traits specialization for UINT8
550 //////////////////////////////////////////////////////////////////////////
551 template<> struct TypeTraits
<SWR_TYPE_UINT
, 8> : PackTraits
<8>
553 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
554 static float toFloat() { return 0.0; }
555 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
556 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
559 //////////////////////////////////////////////////////////////////////////
560 /// TypeTraits - Format type traits specialization for UINT8
561 //////////////////////////////////////////////////////////////////////////
562 template<> struct TypeTraits
<SWR_TYPE_SINT
, 8> : PackTraits
<8, true>
564 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
565 static float toFloat() { return 0.0; }
566 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
567 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
570 //////////////////////////////////////////////////////////////////////////
571 /// TypeTraits - Format type traits specialization for UINT16
572 //////////////////////////////////////////////////////////////////////////
573 template<> struct TypeTraits
<SWR_TYPE_UINT
, 16> : PackTraits
<16>
575 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
576 static float toFloat() { return 0.0; }
577 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
578 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
581 //////////////////////////////////////////////////////////////////////////
582 /// TypeTraits - Format type traits specialization for SINT16
583 //////////////////////////////////////////////////////////////////////////
584 template<> struct TypeTraits
<SWR_TYPE_SINT
, 16> : PackTraits
<16, true>
586 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
587 static float toFloat() { return 0.0; }
588 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
589 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
592 //////////////////////////////////////////////////////////////////////////
593 /// TypeTraits - Format type traits specialization for UINT32
594 //////////////////////////////////////////////////////////////////////////
595 template<> struct TypeTraits
<SWR_TYPE_UINT
, 32> : PackTraits
<32>
597 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
598 static float toFloat() { return 0.0; }
599 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
600 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
603 //////////////////////////////////////////////////////////////////////////
604 /// TypeTraits - Format type traits specialization for UINT32
605 //////////////////////////////////////////////////////////////////////////
606 template<> struct TypeTraits
<SWR_TYPE_SINT
, 32> : PackTraits
<32>
608 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
609 static float toFloat() { return 0.0; }
610 static float fromFloat() { SWR_NOT_IMPL
; return 0.0; }
611 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
614 //////////////////////////////////////////////////////////////////////////
615 /// TypeTraits - Format type traits specialization for UNORM5
616 //////////////////////////////////////////////////////////////////////////
617 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 5> : PackTraits
<5>
619 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
620 static float toFloat() { return 1.0f
/ 31.0f
; }
621 static float fromFloat() { return 31.0f
; }
622 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
625 //////////////////////////////////////////////////////////////////////////
626 /// TypeTraits - Format type traits specialization for UNORM6
627 //////////////////////////////////////////////////////////////////////////
628 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 6> : PackTraits
<6>
630 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
631 static float toFloat() { return 1.0f
/ 63.0f
; }
632 static float fromFloat() { return 63.0f
; }
633 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
636 //////////////////////////////////////////////////////////////////////////
637 /// TypeTraits - Format type traits specialization for UNORM8
638 //////////////////////////////////////////////////////////////////////////
639 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 8> : PackTraits
<8>
641 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
642 static float toFloat() { return 1.0f
/ 255.0f
; }
643 static float fromFloat() { return 255.0f
; }
644 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
647 //////////////////////////////////////////////////////////////////////////
648 /// TypeTraits - Format type traits specialization for UNORM8
649 //////////////////////////////////////////////////////////////////////////
650 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 8> : PackTraits
<8, true>
652 static const SWR_TYPE MyType
= SWR_TYPE_SNORM
;
653 static float toFloat() { return 1.0f
/ 127.0f
; }
654 static float fromFloat() { return 127.0f
; }
655 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
658 //////////////////////////////////////////////////////////////////////////
659 /// TypeTraits - Format type traits specialization for UNORM16
660 //////////////////////////////////////////////////////////////////////////
661 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 16> : PackTraits
<16>
663 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
664 static float toFloat() { return 1.0f
/ 65535.0f
; }
665 static float fromFloat() { return 65535.0f
; }
666 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
669 //////////////////////////////////////////////////////////////////////////
670 /// TypeTraits - Format type traits specialization for SNORM16
671 //////////////////////////////////////////////////////////////////////////
672 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 16> : PackTraits
<16, true>
674 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
675 static float toFloat() { return 1.0f
/ 32767.0f
; }
676 static float fromFloat() { return 32767.0f
; }
677 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
680 //////////////////////////////////////////////////////////////////////////
681 /// TypeTraits - Format type traits specialization for UNORM24
682 //////////////////////////////////////////////////////////////////////////
684 struct TypeTraits
< SWR_TYPE_UNORM
, 24 > : PackTraits
<32>
686 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
687 static float toFloat() { return 1.0f
/ 16777215.0f
; }
688 static float fromFloat() { return 16777215.0f
; }
689 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
//////////////////////////////////////////////////////////////////////////
// FLOAT Specializations from here on...
//////////////////////////////////////////////////////////////////////////
// Bit-preserving reinterpretations between __m128 and __m128i.
#define TO_M128i(a) _mm_castps_si128(a)
#define TO_M128(a) _mm_castsi128_ps(a)
700 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
701 inline static __m128
fastpow(__m128 arg
) {
704 static const __m128 factor
= _mm_set1_ps(exp2(127.0f
* expden
/ expnum
- 127.0f
)
705 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
));
707 // Apply a constant pre-correction factor.
708 ret
= _mm_mul_ps(ret
, factor
);
710 // Reinterpret arg as integer to obtain logarithm.
711 //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
712 ret
= _mm_cvtepi32_ps(_mm_castps_si128(ret
));
714 // Multiply logarithm by power.
715 ret
= _mm_mul_ps(ret
, _mm_set1_ps(1.0f
* expnum
/ expden
));
717 // Convert back to "integer" to exponentiate.
718 //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
719 ret
= _mm_castsi128_ps(_mm_cvtps_epi32(ret
));
724 inline static __m128
pow512_4(__m128 arg
) {
725 // 5/12 is too small, so compute the 4th root of 20/12 instead.
726 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
727 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
728 __m128 xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
729 __m128 xover
= _mm_mul_ps(arg
, xf
);
731 __m128 xfm1
= _mm_rsqrt_ps(xf
);
732 __m128 x2
= _mm_mul_ps(arg
, arg
);
733 __m128 xunder
= _mm_mul_ps(x2
, xfm1
);
735 // sqrt2 * over + 2 * sqrt2 * under
736 __m128 xavg
= _mm_mul_ps(_mm_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
),
737 _mm_add_ps(xover
, xunder
));
739 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
740 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
744 inline static __m128
powf_wrapper(__m128 Base
, float Exp
)
746 float *f
= (float *)(&Base
);
748 return _mm_set_ps(powf(f
[3], Exp
),
754 static inline __m128
ConvertFloatToSRGB2(__m128
& Src
)
756 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
757 __m128i CmpToSRGBThresholdMask
= TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f
), Src
));
759 // squeeze the mask down to 16 bits (4 bits per DWORD)
760 int CompareResult
= _mm_movemask_epi8(CmpToSRGBThresholdMask
);
765 if (CompareResult
== 0xFFFF)
767 // all DWORDs are <= the threshold
768 Result
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
770 else if (CompareResult
== 0x0)
772 // all DWORDs are > the threshold
773 __m128 fSrc_0RGB
= Src
;
775 // --> 1.055f * c(1.0f/2.4f) - 0.055f
776 #if KNOB_USE_FAST_SRGB == TRUE
777 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
778 __m128 f
= pow512_4(fSrc_0RGB
);
780 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
782 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
783 Result
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
787 // some DWORDs are <= the threshold and some are > threshold
788 __m128 Src_0RGB_mul_denorm
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
790 __m128 fSrc_0RGB
= Src
;
792 // --> 1.055f * c(1.0f/2.4f) - 0.055f
793 #if KNOB_USE_FAST_SRGB == TRUE
794 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
795 __m128 f
= pow512_4(fSrc_0RGB
);
797 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
799 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
800 f
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
802 // Clear the alpha (is garbage after the sub)
803 __m128i i
= _mm_and_si128(TO_M128i(f
), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
805 __m128i LessThanPart
= _mm_and_si128(CmpToSRGBThresholdMask
, TO_M128i(Src_0RGB_mul_denorm
));
806 __m128i GreaterEqualPart
= _mm_andnot_si128(CmpToSRGBThresholdMask
, i
);
807 __m128i CombinedParts
= _mm_or_si128(LessThanPart
, GreaterEqualPart
);
809 Result
= TO_M128(CombinedParts
);
815 #if ENABLE_AVX512_SIMD16
816 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
817 inline static simd16scalar SIMDCALL
fastpow(simd16scalar
const &value
)
819 static const float factor1
= exp2(127.0f
* expden
/ expnum
- 127.0f
)
820 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
);
822 // Apply a constant pre-correction factor.
823 simd16scalar result
= _simd16_mul_ps(value
, _simd16_set1_ps(factor1
));
825 // Reinterpret arg as integer to obtain logarithm.
826 //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
827 result
= _simd16_cvtepi32_ps(_simd16_castps_si(result
));
829 // Multiply logarithm by power.
830 result
= _simd16_mul_ps(result
, _simd16_set1_ps(1.0f
* expnum
/ expden
));
832 // Convert back to "integer" to exponentiate.
833 //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
834 result
= _simd16_castsi_ps(_simd16_cvtps_epi32(result
));
// Approximates arg^(5/12) for each lane of a 16-wide float vector.
// Two estimates of arg^(5/3) are built from fastpow<2, 3, ...>, combined with
// a constant weight, and then reduced by two successive x * rsqrt(x) steps
// (each approximating a square root), i.e. an approximate 4th root.
// NOTE(review): the statement returning `xavg` is not visible here — confirm.
839 inline static simd16scalar SIMDCALL
pow512_4(simd16scalar
const &arg
)
841 // 5/12 is too small, so compute the 4th root of 20/12 instead.
842 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
843 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
844 simd16scalar xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
// First estimate: arg * xf.
845 simd16scalar xover
= _simd16_mul_ps(arg
, xf
);
// Second estimate: arg^2 * rsqrt(xf).
847 simd16scalar xfm1
= _simd16_rsqrt_ps(xf
);
848 simd16scalar x2
= _simd16_mul_ps(arg
, arg
);
849 simd16scalar xunder
= _simd16_mul_ps(x2
, xfm1
);
851 // sqrt2 * over + 2 * sqrt2 * under
// As written, the code applies one constant weight to (xover + xunder).
852 simd16scalar xavg
= _simd16_mul_ps(_simd16_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
), _simd16_add_ps(xover
, xunder
));
// x * rsqrt(x) approximates sqrt(x); applied twice this yields the 4th root.
854 xavg
= _simd16_mul_ps(xavg
, _simd16_rsqrt_ps(xavg
));
855 xavg
= _simd16_mul_ps(xavg
, _simd16_rsqrt_ps(xavg
));
// 16-wide wrapper taking a vector base and a scalar exponent.
// The vector is viewed as an array of floats through `f`, and the result is
// assembled with _simd16_set_ps.
// NOTE(review): the arguments passed to _simd16_set_ps are not visible here;
// presumably one powf(f[i], exp) per lane — confirm.
860 inline static simd16scalar SIMDCALL
powf_wrapper(const simd16scalar
&base
, float exp
)
// Scalar view of the vector lanes.
862 const float *f
= reinterpret_cast<const float *>(&base
);
864 return _simd16_set_ps(
884 // float to SRGB conversion formula
886 // if (value < 0.0031308f)
889 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
// 16-wide float to sRGB conversion.
// Lanes below 0.0031308 take the linear segment (value * 12.92); all other
// lanes take 1.055 * value^(1/2.4) - 0.055. The power term is only evaluated
// when at least one lane is not below the threshold.
// NOTE(review): the statement returning `result` is not visible here — confirm.
891 static inline simd16scalar
ConvertFloatToSRGB2(const simd16scalar
&value
)
893 // create a mask where the source is < the minimal SRGB float value
894 const simd16mask mask
= _simd16_cmplt_ps_mask(value
, _simd16_set1_ps(0.0031308f
));
896 // if all elements are < the threshold, result = value * 12.92
897 simd16scalar result
= _simd16_mul_ps(value
, _simd16_set1_ps(12.92f
));
// 0xFFFF means all 16 mask bits are set, i.e. every lane is below the threshold.
899 if (_simd16_mask2int(mask
) != 0xFFFF)
901 // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
902 #if KNOB_USE_FAST_SRGB == TRUE
903 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
904 simd16scalar result2
= pow512_4(value
);
// Alternative definition of result2 using the powf-based wrapper.
906 simd16scalar result2
= powf_wrapper(value
, 1.0f
/ 2.4f
);
909 result2
= _simd16_mul_ps(result2
, _simd16_set1_ps(1.055f
));
910 result2
= _simd16_sub_ps(result2
, _simd16_set1_ps(0.055f
));
912 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
913 // only native AVX512 can directly use the computed mask for the blend operation
914 result
= _mm512_mask_blend_ps(mask
, result2
, result
);
// Alternative blend: the comparison is recomputed as a vector mask for blendv.
916 result
= _simd16_blendv_ps(result2
, result
, _simd16_cmplt_ps(value
, _simd16_set1_ps(0.0031308f
)));
924 //////////////////////////////////////////////////////////////////////////
925 /// TypeTraits - Format type traits specialization for FLOAT16
926 //////////////////////////////////////////////////////////////////////////
// Type traits for 16-bit float components, layered on PackTraits<16>.
// pack() converts float32 lanes to float16; unpack() and convertSrgb() are
// not implemented here and return zero vectors.
927 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 16> : PackTraits
<16>
929 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
// Both conversion scale factors are 1.0f for this type.
930 static float toFloat() { return 1.0f
; }
931 static float fromFloat() { return 1.0f
; }
// sRGB conversion is not implemented for this type; returns a zero vector.
932 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_NOT_IMPL
; return _simd_setzero_ps(); }
// Converts 8 packed float32 values to 8 packed float16 values.
// Two implementations are visible: a manual bit-manipulation path guarded by
// KNOB_ARCH == KNOB_ARCH_AVX, and a path using _mm256_cvtps_ph with
// _MM_FROUND_TRUNC.
934 static simdscalar
pack(const simdscalar
&in
)
936 #if KNOB_SIMD_WIDTH == 8
937 #if (KNOB_ARCH == KNOB_ARCH_AVX)
938 // input is 8 packed float32, output is 8 packed float16
939 simdscalari src
= _simd_castps_si(in
);
// float32 field layout: 8 exponent bits above 23 mantissa bits.
941 static const uint32_t FLOAT_EXP_BITS
= 8;
942 static const uint32_t FLOAT_MANTISSA_BITS
= 23;
943 static const uint32_t FLOAT_MANTISSA_MASK
= (1U << FLOAT_MANTISSA_BITS
) - 1;
944 static const uint32_t FLOAT_EXP_MASK
= ((1U << FLOAT_EXP_BITS
) - 1) << FLOAT_MANTISSA_BITS
;
// float16 field layout: 5 exponent bits above 10 mantissa bits.
946 static const uint32_t HALF_EXP_BITS
= 5;
947 static const uint32_t HALF_MANTISSA_BITS
= 10;
948 static const uint32_t HALF_EXP_MASK
= ((1U << HALF_EXP_BITS
) - 1) << HALF_MANTISSA_BITS
;
950 // minimum exponent required, exponents below this are flushed to 0.
951 static const int32_t HALF_EXP_MIN
= -14;
952 static const int32_t FLOAT_EXP_BIAS
= 127;
953 static const int32_t FLOAT_EXP_MIN
= HALF_EXP_MIN
+ FLOAT_EXP_BIAS
;
954 static const int32_t FLOAT_EXP_MIN_FTZ
= FLOAT_EXP_MIN
- (HALF_MANTISSA_BITS
+ 1); // +1 for the lack of implicit significand
956 // maximum exponent required, exponents above this are set to infinity
957 static const int32_t HALF_EXP_MAX
= 15;
958 static const int32_t FLOAT_EXP_MAX
= HALF_EXP_MAX
+ FLOAT_EXP_BIAS
;
// Broadcast the masks and the biased exponent thresholds, each positioned in
// the float32 exponent field.
960 const simdscalari vSignMask
= _simd_set1_epi32(0x80000000);
961 const simdscalari vExpMask
= _simd_set1_epi32(FLOAT_EXP_MASK
);
962 const simdscalari vManMask
= _simd_set1_epi32(FLOAT_MANTISSA_MASK
);
963 const simdscalari vExpMin
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN
<< FLOAT_MANTISSA_BITS
));
964 const simdscalari vExpMinFtz
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN_FTZ
<< FLOAT_MANTISSA_BITS
));
965 const simdscalari vExpMax
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MAX
<< FLOAT_MANTISSA_BITS
));
// Split every lane into its sign, exponent and mantissa fields.
967 simdscalari vSign
= _simd_and_si(src
, vSignMask
);
968 simdscalari vExp
= _simd_and_si(src
, vExpMask
);
969 simdscalari vMan
= _simd_and_si(src
, vManMask
);
// Per-lane classification:
//   vFTZMask    - exponent below the flush-to-zero threshold
//   vDenormMask - exponent below vExpMin but not flushed
//   vInfMask    - exponent field all ones
//   vClampMask  - exponent above vExpMax and not in vInfMask
971 simdscalari vFTZMask
= _simd_cmplt_epi32(vExp
, vExpMinFtz
);
972 simdscalari vDenormMask
= _simd_andnot_si(vFTZMask
, _simd_cmplt_epi32(vExp
, vExpMin
));
973 simdscalari vInfMask
= _simd_cmpeq_epi32(vExpMask
, vExp
);
974 simdscalari vClampMask
= _simd_andnot_si(vInfMask
, _simd_cmplt_epi32(vExpMax
, vExp
));
// Re-bias the exponent: subtract the minimum and add one exponent unit.
976 simdscalari vHalfExp
= _simd_add_epi32(_simd_sub_epi32(vExp
, vExpMin
), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS
));
978 // pack output 16-bits into the lower 16-bits of each 32-bit channel
979 simdscalari vDst
= _simd_and_si(_simd_srli_epi32(vHalfExp
, 13), _simd_set1_epi32(HALF_EXP_MASK
));
980 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vMan
, FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
// Zero the lanes selected by vFTZMask.
983 vDst
= _simd_andnot_si(vFTZMask
, vDst
);
984 // Apply Infinities / NaN
985 vDst
= _simd_or_si(vDst
, _simd_and_si(vInfMask
, _simd_set1_epi32(HALF_EXP_MASK
)));
// Replace lanes selected by vClampMask with 0x7BFF, the value just below the
// all-ones half exponent pattern (HALF_EXP_MASK).
988 vDst
= _simd_andnot_si(vClampMask
, vDst
);
989 vDst
= _simd_or_si(vDst
,
990 _simd_and_si(vClampMask
, _simd_set1_epi32(0x7BFF)));
992 // Compute Denormals (subnormals)
// Scalar fix-up, entered only when at least one lane is flagged in vDenormMask.
993 if (!_mm256_testz_si256(vDenormMask
, vDenormMask
))
// The vectors are accessed lane by lane through uint32_t pointers.
995 uint32_t *pDenormMask
= (uint32_t*)&vDenormMask
;
996 uint32_t *pExp
= (uint32_t*)&vExp
;
997 uint32_t *pMan
= (uint32_t*)&vMan
;
998 uint32_t *pDst
= (uint32_t*)&vDst
;
// NOTE(review): the per-lane test of pDenormMask[i] is not visible here;
// presumably only flagged lanes are rewritten — confirm.
999 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
1003 // Need to compute subnormal value
1004 uint32_t exponent
= pExp
[i
] >> FLOAT_MANTISSA_BITS
;
1005 uint32_t mantissa
= pMan
[i
] |
1006 (1U << FLOAT_MANTISSA_BITS
); // Denorms include no "implicit" 1s. Make it explicit
// Shift right by the exponent deficit plus the mantissa width difference.
1008 pDst
[i
] = mantissa
>> ((FLOAT_EXP_MIN
- exponent
) + (FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
// Move the sign from bit 31 down to bit 15.
1014 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vSign
, 16));
1016 // Pack to lower 128-bits
1017 vDst
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst
), _mm256_extractf128_si256(vDst
, 1)));
// Debug-only cross-check against _mm256_cvtps_ph; the first four 32-bit
// elements are compared (8 packed 16-bit results).
1020 #if !defined(NDEBUG)
1021 simdscalari vCheck
= _mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
));
1023 for (uint32_t i
= 0; i
< 4; ++i
)
1025 SWR_ASSERT(vCheck
.m256i_i32
[i
] == vDst
.m256i_i32
[i
]);
1030 return _simd_castsi_ps(vDst
);
// Alternative path: conversion done directly by _mm256_cvtps_ph (truncating).
1033 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
)));
1036 #error Unsupported vector width
// float16 -> float32 unpack is not implemented; returns a zero vector.
1040 static simdscalar
unpack(const simdscalar
&in
)
1042 // input is 8 packed float16, output is 8 packed float32
1043 SWR_NOT_IMPL
; // @todo
1044 return _simd_setzero_ps();
1046 #if ENABLE_AVX512_SIMD16
// 16-wide pack: each 8-wide half of the input is converted to 128 bits of
// float16 data; the two halves are placed in the low and high 128 bits of
// resultlo, which is inserted at index 0 of a zero-initialised result.
1048 static simd16scalar
pack(const simd16scalar
&in
)
1050 simd16scalari result
= _simd16_setzero_si();
1051 simdscalari resultlo
= _simd_setzero_si();
1053 #if (KNOB_ARCH == KNOB_ARCH_AVX)
// Reuse the 8-wide pack() and keep the low 128 bits of each result.
1054 simdscalar simdlo
= pack(_simd16_extract_ps(in
, 0));
1055 simdscalar simdhi
= pack(_simd16_extract_ps(in
, 1));
1057 __m128i templo
= _simd_extractf128_si(_simd_castps_si(simdlo
), 0);
1058 __m128i temphi
= _simd_extractf128_si(_simd_castps_si(simdhi
), 0);
// Alternative path: convert each half directly with _mm256_cvtps_ph.
1061 __m128i templo
= _mm256_cvtps_ph(_simd16_extract_ps(in
, 0), _MM_FROUND_TRUNC
);
1062 __m128i temphi
= _mm256_cvtps_ph(_simd16_extract_ps(in
, 1), _MM_FROUND_TRUNC
);
1065 resultlo
= _simd_insertf128_si(resultlo
, templo
, 0);
1066 resultlo
= _simd_insertf128_si(resultlo
, temphi
, 1);
1068 result
= _simd16_insert_si(result
, resultlo
, 0);
1070 return _simd16_castsi_ps(result
);
// 16-wide float16 -> float32 unpack is not implemented; returns a zero vector.
1073 static simd16scalar
unpack(const simd16scalar
&in
)
1075 // input is 16 packed float16, output is 16 packed float32
1076 SWR_NOT_IMPL
; // @todo
1077 return _simd16_setzero_ps();
1082 //////////////////////////////////////////////////////////////////////////
1083 /// TypeTraits - Format type traits specialization for FLOAT32
1084 //////////////////////////////////////////////////////////////////////////
// Type traits for 32-bit float components, layered on PackTraits<32>.
// Provides unit scale factors and sRGB conversion via ConvertFloatToSRGB2.
1085 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 32> : PackTraits
<32>
1087 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
// Both conversion scale factors are 1.0f for this type.
1088 static float toFloat() { return 1.0f
; }
1089 static float fromFloat() { return 1.0f
; }
// 8-wide sRGB conversion: the vector is split into two 128-bit halves, each
// half is converted, and the results are written back into `in`.
// NOTE(review): the return statement is not visible here — confirm.
1090 static inline simdscalar
convertSrgb(simdscalar
&in
)
1092 #if KNOB_SIMD_WIDTH == 8
1093 __m128 srcLo
= _mm256_extractf128_ps(in
, 0);
1094 __m128 srcHi
= _mm256_extractf128_ps(in
, 1);
1096 srcLo
= ConvertFloatToSRGB2(srcLo
);
1097 srcHi
= ConvertFloatToSRGB2(srcHi
);
// `in` is modified in place through the non-const reference.
1099 in
= _mm256_insertf128_ps(in
, srcLo
, 0);
1100 in
= _mm256_insertf128_ps(in
, srcHi
, 1);
1102 #error Unsupported vector width
1106 #if ENABLE_AVX512_SIMD16
// 16-wide sRGB conversion: forwards to the simd16 ConvertFloatToSRGB2 overload.
1108 static inline simd16scalar
convertSrgb(simd16scalar
&in
)
1110 return ConvertFloatToSRGB2(in
);
1115 //////////////////////////////////////////////////////////////////////////
1116 /// FormatIntType - Calculate base integer type for pixel components based
1117 /// on total number of bits. Components can be smaller
1118 /// than this type, but the entire pixel must not be
1119 /// any smaller than this type.
1120 //////////////////////////////////////////////////////////////////////////
// Primary template: selected when neither specialisation matches, i.e. when
// bits > 16 (bits8 and bits16 both false). TYPE is uint32_t.
// bits8 and bits16 are defaulted from `bits` and drive the specialisations.
1121 template <uint32_t bits
, bool bits8
= bits
<= 8, bool bits16
= bits
<= 16>
1122 struct FormatIntType
1124 typedef uint32_t TYPE
;
// Specialisation for bits <= 8 (bits8 and bits16 both true): TYPE is uint8_t.
1127 template <uint32_t bits
>
1128 struct FormatIntType
<bits
, true, true>
1130 typedef uint8_t TYPE
;
// Specialisation for 8 < bits <= 16 (bits8 false, bits16 true): TYPE is uint16_t.
1133 template <uint32_t bits
>
1134 struct FormatIntType
<bits
, false, true>
1136 typedef uint16_t TYPE
;
1139 //////////////////////////////////////////////////////////////////////////
1140 /// Format1 - Bitfield for single component formats.
1141 //////////////////////////////////////////////////////////////////////////
// Single-component format: x is the component's bit count.
// TYPE is the integer type FormatIntType selects for x bits.
// NOTE(review): the declaration line and the bitfield members that follow
// the typedef are not visible here — confirm against the full file.
1142 template<uint32_t x
>
1145 typedef typename FormatIntType
<x
>::TYPE TYPE
;
1151 ///@ The following are here to provide full template needed in Formats.
1166 //////////////////////////////////////////////////////////////////////////
1167 /// Format2 - Bitfield for 2 component formats.
1168 //////////////////////////////////////////////////////////////////////////
// Two-component format: x and y are the per-component bit counts.
// TYPE is the integer type FormatIntType selects for x + y bits.
// NOTE(review): the declaration line and the bitfield members are not
// visible here — confirm against the full file.
1169 template<uint32_t x
, uint32_t y
>
1172 typedef typename FormatIntType
<x
+ y
>::TYPE TYPE
;
1181 ///@ The following are here to provide full template needed in Formats.
1187 //////////////////////////////////////////////////////////////////////////
1188 /// Format3 - Bitfield for 3 component formats.
1189 //////////////////////////////////////////////////////////////////////////
// Three-component format: x, y and z are the per-component bit counts.
// TYPE is the integer type FormatIntType selects for x + y + z bits.
// NOTE(review): the declaration line and the other members are not visible
// here — confirm against the full file.
1190 template<uint32_t x
, uint32_t y
, uint32_t z
>
1193 typedef typename FormatIntType
<x
+ y
+ z
>::TYPE TYPE
;
// Extra member `a`; per the note below it exists only so that code written
// against the full template shape still compiles.
1201 TYPE a
; ///@note This is here to provide full template needed in Formats.
1204 //////////////////////////////////////////////////////////////////////////
1205 /// Format4 - Bitfield for 4 component formats.
1206 //////////////////////////////////////////////////////////////////////////
// Four-component format: x, y, z and w are the per-component bit counts.
// TYPE is the integer type FormatIntType selects for x + y + z + w bits.
// NOTE(review): the declaration line and the bitfield members are not
// visible here — confirm against the full file.
1207 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
1210 typedef typename FormatIntType
<x
+ y
+ z
+ w
>::TYPE TYPE
;
1218 //////////////////////////////////////////////////////////////////////////
1219 /// ComponentTraits - Default components
1220 //////////////////////////////////////////////////////////////////////////
// Default component values, supplied as four template constants x, y, z, w.
// GetDefault(comp) returns the constant at index comp (0 -> x ... 3 -> w).
// comp indexes a 4-element table directly; no range check is visible here.
1221 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
1224 INLINE
static uint32_t GetDefault(uint32_t comp
)
1226 static const uint32_t defaults
[4]{ x
, y
, z
, w
};
1227 return defaults
[comp
];
1231 //////////////////////////////////////////////////////////////////////////
1232 /// ComponentTraits - Component type traits.
1233 //////////////////////////////////////////////////////////////////////////
1234 template<SWR_TYPE X
, uint32_t NumBitsX
, SWR_TYPE Y
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsY
= 0, SWR_TYPE Z
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsZ
= 0, SWR_TYPE W
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsW
= 0>
1235 struct ComponentTraits
// Returns the SWR_TYPE of component `comp` (0 -> X, 1 -> Y, 2 -> Z, 3 -> W).
// comp indexes a 4-element table directly; no range check is visible here.
1237 INLINE
static SWR_TYPE
GetType(uint32_t comp
)
1239 static const SWR_TYPE CompType
[4]{ X
, Y
, Z
, W
};
1240 return CompType
[comp
];
// constexpr bits-per-component lookup: 3 -> NumBitsW, 2 -> NumBitsZ,
// 1 -> NumBitsY, and any other value -> NumBitsX.
1243 INLINE
static constexpr uint32_t GetConstBPC(uint32_t comp
)
1245 return (comp
== 3) ? NumBitsW
:
1246 ((comp
== 2) ? NumBitsZ
:
1247 ((comp
== 1) ? NumBitsY
: NumBitsX
) );
// Bits-per-component lookup backed by a static table ordered X, Y, Z, W.
// NOTE(review): the statement that indexes MyBpc and returns is not visible
// here; presumably `return MyBpc[comp];` — confirm.
1250 INLINE
static uint32_t GetBPC(uint32_t comp
)
1252 static const uint32_t MyBpc
[4]{ NumBitsX
, NumBitsY
, NumBitsZ
, NumBitsW
};
// Reports whether a component's type is SWR_TYPE_UNORM or SWR_TYPE_SNORM.
// One return per component type (X, Y, Z, W) is visible, followed by
// SWR_INVALID for an unexpected component.
// NOTE(review): the dispatch on `comp` and the value returned after
// SWR_INVALID are not visible here — confirm.
1256 INLINE
static bool isNormalized(uint32_t comp
)
1261 return (X
== SWR_TYPE_UNORM
|| X
== SWR_TYPE_SNORM
) ? true : false;
1263 return (Y
== SWR_TYPE_UNORM
|| Y
== SWR_TYPE_SNORM
) ? true : false;
1265 return (Z
== SWR_TYPE_UNORM
|| Z
== SWR_TYPE_SNORM
) ? true : false;
1267 return (W
== SWR_TYPE_UNORM
|| W
== SWR_TYPE_SNORM
) ? true : false;
1269 SWR_INVALID("Invalid component: %d", comp
);
// Forwards to TypeTraits<type, bits>::toFloat() for the selected component.
// After SWR_INVALID the X component's value is returned as a fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1273 INLINE
static float toFloat(uint32_t comp
)
1278 return TypeTraits
<X
, NumBitsX
>::toFloat();
1280 return TypeTraits
<Y
, NumBitsY
>::toFloat();
1282 return TypeTraits
<Z
, NumBitsZ
>::toFloat();
1284 return TypeTraits
<W
, NumBitsW
>::toFloat();
1286 SWR_INVALID("Invalid component: %d", comp
);
1287 return TypeTraits
<X
, NumBitsX
>::toFloat();
// Forwards to TypeTraits<type, bits>::fromFloat() for the selected component.
// After SWR_INVALID the X component's value is returned as a fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1291 INLINE
static float fromFloat(uint32_t comp
)
1296 return TypeTraits
<X
, NumBitsX
>::fromFloat();
1298 return TypeTraits
<Y
, NumBitsY
>::fromFloat();
1300 return TypeTraits
<Z
, NumBitsZ
>::fromFloat();
1302 return TypeTraits
<W
, NumBitsW
>::fromFloat();
1304 SWR_INVALID("Invalid component: %d", comp
);
1305 return TypeTraits
<X
, NumBitsX
>::fromFloat();
// Forwards to TypeTraits<type, bits>::loadSOA(pSrc) for the selected component.
// After SWR_INVALID the X component's loader is used as a fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1308 INLINE
static simdscalar
loadSOA(uint32_t comp
, const uint8_t* pSrc
)
1313 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1315 return TypeTraits
<Y
, NumBitsY
>::loadSOA(pSrc
);
1317 return TypeTraits
<Z
, NumBitsZ
>::loadSOA(pSrc
);
1319 return TypeTraits
<W
, NumBitsW
>::loadSOA(pSrc
);
1321 SWR_INVALID("Invalid component: %d", comp
);
1322 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
// Forwards to TypeTraits<type, bits>::storeSOA(pDst, src) for the selected
// component. No fallback store is visible after SWR_INVALID, unlike the
// simd16 overload of storeSOA.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1325 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simdscalar
const &src
)
1330 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1333 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1336 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1339 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1342 SWR_INVALID("Invalid component: %d", comp
);
// Forwards to TypeTraits<type, bits>::unpack(in) for the selected component,
// storing the result in `out`.
// NOTE(review): the declaration of `out`, the dispatch on `comp`, and the
// final return are not visible here — confirm.
1345 INLINE
static simdscalar
unpack(uint32_t comp
, simdscalar
&in
)
1351 out
= TypeTraits
<X
, NumBitsX
>::unpack(in
); break;
1353 out
= TypeTraits
<Y
, NumBitsY
>::unpack(in
); break;
1355 out
= TypeTraits
<Z
, NumBitsZ
>::unpack(in
); break;
1357 out
= TypeTraits
<W
, NumBitsW
>::unpack(in
); break;
1359 SWR_INVALID("Invalid component: %d", comp
);
// Forwards to TypeTraits<type, bits>::pack(in) for the selected component,
// storing the result in `out`.
// NOTE(review): the declaration of `out`, the dispatch on `comp`, and the
// final return are not visible here — confirm.
1366 INLINE
static simdscalar
pack(uint32_t comp
, simdscalar
&in
)
1372 out
= TypeTraits
<X
, NumBitsX
>::pack(in
); break;
1374 out
= TypeTraits
<Y
, NumBitsY
>::pack(in
); break;
1376 out
= TypeTraits
<Z
, NumBitsZ
>::pack(in
); break;
1378 out
= TypeTraits
<W
, NumBitsW
>::pack(in
); break;
1380 SWR_INVALID("Invalid component: %d", comp
);
// Forwards to TypeTraits<type, bits>::convertSrgb(in) for the selected
// component. After SWR_INVALID the X component's conversion is the fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1387 INLINE
static simdscalar
convertSrgb(uint32_t comp
, simdscalar
&in
)
1392 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1394 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1396 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1398 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1400 SWR_INVALID("Invalid component: %d", comp
);
1401 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1403 #if ENABLE_AVX512_SIMD16
// 16-wide load: forwards to TypeTraits<type, bits>::loadSOA_16(pSrc) for the
// selected component. After SWR_INVALID the X component's loader is the fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1405 INLINE
static simd16scalar
loadSOA_16(uint32_t comp
, const uint8_t* pSrc
)
1410 return TypeTraits
<X
, NumBitsX
>::loadSOA_16(pSrc
);
1412 return TypeTraits
<Y
, NumBitsY
>::loadSOA_16(pSrc
);
1414 return TypeTraits
<Z
, NumBitsZ
>::loadSOA_16(pSrc
);
1416 return TypeTraits
<W
, NumBitsW
>::loadSOA_16(pSrc
);
1418 SWR_INVALID("Invalid component: %d", comp
);
1419 return TypeTraits
<X
, NumBitsX
>::loadSOA_16(pSrc
);
// 16-wide store: forwards to TypeTraits<type, bits>::storeSOA(pDst, src) for
// the selected component. After SWR_INVALID the X component's store is
// issued as a fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1422 INLINE
static void SIMDCALL
storeSOA(uint32_t comp
, uint8_t *pDst
, simd16scalar
const &src
)
1427 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1430 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1433 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1436 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1439 SWR_INVALID("Invalid component: %d", comp
);
1440 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
// 16-wide unpack: forwards to TypeTraits<type, bits>::unpack(in) for the
// selected component. After SWR_INVALID the X component's unpack is the fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1443 INLINE
static simd16scalar
unpack(uint32_t comp
, simd16scalar
&in
)
1448 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1450 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1452 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1454 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1456 SWR_INVALID("Invalid component: %d", comp
);
1457 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
// 16-wide pack: forwards to TypeTraits<type, bits>::pack(in) for the selected
// component. After SWR_INVALID the X component's pack is the fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1460 INLINE
static simd16scalar
pack(uint32_t comp
, simd16scalar
&in
)
1465 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1467 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1469 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1471 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1473 SWR_INVALID("Invalid component: %d", comp
);
1474 return TypeTraits
<X
, NumBitsX
>::pack(in
);
// 16-wide sRGB conversion: forwards to TypeTraits<type, bits>::convertSrgb(in)
// for the selected component. After SWR_INVALID the X component's conversion
// is the fallback.
// NOTE(review): the dispatch on `comp` is not visible here — confirm.
1477 INLINE
static simd16scalar
convertSrgb(uint32_t comp
, simd16scalar
&in
)
1482 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1484 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1486 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1488 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1490 SWR_INVALID("Invalid component: %d", comp
);
1491 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);