1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for SWR_FORMAT functions.
27 ******************************************************************************/
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
34 //////////////////////////////////////////////////////////////////////////
35 template <uint32_t NumBits
, bool Signed
= false>
38 static const uint32_t MyNumBits
= NumBits
;
39 static simdscalar
loadSOA(const uint8_t *pSrc
) = delete;
40 static void storeSOA(uint8_t *pDst
, simdscalar src
) = delete;
41 static simdscalar
unpack(simdscalar
&in
) = delete;
42 static simdscalar
pack(simdscalar
&in
) = delete;
45 //////////////////////////////////////////////////////////////////////////
46 /// PackTraits - Helpers for packing / unpacking unused channels
47 //////////////////////////////////////////////////////////////////////////
49 struct PackTraits
<0, false>
51 static const uint32_t MyNumBits
= 0;
53 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_setzero_ps(); }
54 static void storeSOA(uint8_t *pDst
, simdscalar src
) { return; }
55 static simdscalar
unpack(simdscalar
&in
) { return _simd_setzero_ps(); }
56 static simdscalar
pack(simdscalar
&in
) { return _simd_setzero_ps(); }
60 //////////////////////////////////////////////////////////////////////////
61 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
62 //////////////////////////////////////////////////////////////////////////
64 struct PackTraits
<8, false>
66 static const uint32_t MyNumBits
= 8;
68 static simdscalar
loadSOA(const uint8_t *pSrc
)
70 #if KNOB_SIMD_WIDTH == 8
71 __m256 result
= _mm256_setzero_ps();
72 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
73 return _mm256_insertf128_ps(result
, vLo
, 0);
75 #error Unsupported vector width
79 static void storeSOA(uint8_t *pDst
, simdscalar src
)
82 #if KNOB_SIMD_WIDTH == 8
83 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
85 #error Unsupported vector width
89 static simdscalar
unpack(simdscalar
&in
)
91 #if KNOB_SIMD_WIDTH == 8
92 #if KNOB_ARCH==KNOB_ARCH_AVX
93 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
94 __m128i resLo
= _mm_cvtepu8_epi32(src
);
95 __m128i resHi
= _mm_shuffle_epi8(src
,
96 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
98 __m256i result
= _mm256_castsi128_si256(resLo
);
99 result
= _mm256_insertf128_si256(result
, resHi
, 1);
100 return _mm256_castsi256_ps(result
);
101 #elif KNOB_ARCH==KNOB_ARCH_AVX2
102 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
105 #error Unsupported vector width
109 static simdscalar
pack(simdscalar
&in
)
111 #if KNOB_SIMD_WIDTH == 8
112 simdscalari src
= _simd_castps_si(in
);
113 __m128i res16
= _mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
114 __m128i res8
= _mm_packus_epi16(res16
, _mm_undefined_si128());
115 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
117 #error Unsupported vector width
122 //////////////////////////////////////////////////////////////////////////
123 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
124 //////////////////////////////////////////////////////////////////////////
126 struct PackTraits
<8, true>
128 static const uint32_t MyNumBits
= 8;
130 static simdscalar
loadSOA(const uint8_t *pSrc
)
132 #if KNOB_SIMD_WIDTH == 8
133 __m256 result
= _mm256_setzero_ps();
134 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
135 return _mm256_insertf128_ps(result
, vLo
, 0);
137 #error Unsupported vector width
141 static void storeSOA(uint8_t *pDst
, simdscalar src
)
144 #if KNOB_SIMD_WIDTH == 8
145 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
147 #error Unsupported vector width
151 static simdscalar
unpack(simdscalar
&in
)
153 #if KNOB_SIMD_WIDTH == 8
154 #if KNOB_ARCH==KNOB_ARCH_AVX
155 SWR_ASSERT(0); // I think this may be incorrect.
156 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
157 __m128i resLo
= _mm_cvtepi8_epi32(src
);
158 __m128i resHi
= _mm_shuffle_epi8(src
,
159 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
161 __m256i result
= _mm256_castsi128_si256(resLo
);
162 result
= _mm256_insertf128_si256(result
, resHi
, 1);
163 return _mm256_castsi256_ps(result
);
164 #elif KNOB_ARCH==KNOB_ARCH_AVX2
165 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
168 #error Unsupported vector width
172 static simdscalar
pack(simdscalar
&in
)
174 #if KNOB_SIMD_WIDTH == 8
175 simdscalari src
= _simd_castps_si(in
);
176 __m128i res16
= _mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
177 __m128i res8
= _mm_packs_epi16(res16
, _mm_undefined_si128());
178 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
180 #error Unsupported vector width
185 //////////////////////////////////////////////////////////////////////////
186 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
187 //////////////////////////////////////////////////////////////////////////
189 struct PackTraits
<16, false>
191 static const uint32_t MyNumBits
= 16;
193 static simdscalar
loadSOA(const uint8_t *pSrc
)
195 #if KNOB_SIMD_WIDTH == 8
196 __m256 result
= _mm256_setzero_ps();
197 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
198 return _mm256_insertf128_ps(result
, vLo
, 0);
200 #error Unsupported vector width
204 static void storeSOA(uint8_t *pDst
, simdscalar src
)
206 #if KNOB_SIMD_WIDTH == 8
207 // store 16B (2B * 8)
208 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
210 #error Unsupported vector width
214 static simdscalar
unpack(simdscalar
&in
)
216 #if KNOB_SIMD_WIDTH == 8
217 #if KNOB_ARCH==KNOB_ARCH_AVX
218 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
219 __m128i resLo
= _mm_cvtepu16_epi32(src
);
220 __m128i resHi
= _mm_shuffle_epi8(src
,
221 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
223 __m256i result
= _mm256_castsi128_si256(resLo
);
224 result
= _mm256_insertf128_si256(result
, resHi
, 1);
225 return _mm256_castsi256_ps(result
);
226 #elif KNOB_ARCH==KNOB_ARCH_AVX2
227 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
230 #error Unsupported vector width
234 static simdscalar
pack(simdscalar
&in
)
236 #if KNOB_SIMD_WIDTH == 8
237 simdscalari src
= _simd_castps_si(in
);
238 __m256i res
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
239 return _mm256_castsi256_ps(res
);
241 #error Unsupported vector width
246 //////////////////////////////////////////////////////////////////////////
247 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
248 //////////////////////////////////////////////////////////////////////////
250 struct PackTraits
<16, true>
252 static const uint32_t MyNumBits
= 16;
254 static simdscalar
loadSOA(const uint8_t *pSrc
)
256 #if KNOB_SIMD_WIDTH == 8
257 __m256 result
= _mm256_setzero_ps();
258 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
259 return _mm256_insertf128_ps(result
, vLo
, 0);
261 #error Unsupported vector width
265 static void storeSOA(uint8_t *pDst
, simdscalar src
)
267 #if KNOB_SIMD_WIDTH == 8
268 // store 16B (2B * 8)
269 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
271 #error Unsupported vector width
275 static simdscalar
unpack(simdscalar
&in
)
277 #if KNOB_SIMD_WIDTH == 8
278 #if KNOB_ARCH==KNOB_ARCH_AVX
279 SWR_ASSERT(0); // I think this is incorrectly implemented
280 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
281 __m128i resLo
= _mm_cvtepi16_epi32(src
);
282 __m128i resHi
= _mm_shuffle_epi8(src
,
283 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
285 __m256i result
= _mm256_castsi128_si256(resLo
);
286 result
= _mm256_insertf128_si256(result
, resHi
, 1);
287 return _mm256_castsi256_ps(result
);
288 #elif KNOB_ARCH==KNOB_ARCH_AVX2
289 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
292 #error Unsupported vector width
296 static simdscalar
pack(simdscalar
&in
)
298 #if KNOB_SIMD_WIDTH == 8
299 simdscalari src
= _simd_castps_si(in
);
300 __m256i res
= _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
301 return _mm256_castsi256_ps(res
);
303 #error Unsupported vector width
308 //////////////////////////////////////////////////////////////////////////
309 /// PackTraits - Helpers for packing / unpacking 32 bit channels
310 //////////////////////////////////////////////////////////////////////////
312 struct PackTraits
<32, false>
314 static const uint32_t MyNumBits
= 32;
316 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_load_ps((const float*)pSrc
); }
317 static void storeSOA(uint8_t *pDst
, simdscalar src
) { _simd_store_ps((float*)pDst
, src
); }
318 static simdscalar
unpack(simdscalar
&in
) { return in
; }
319 static simdscalar
pack(simdscalar
&in
) { return in
; }
322 //////////////////////////////////////////////////////////////////////////
323 /// TypeTraits - Format type traits.
324 //////////////////////////////////////////////////////////////////////////
325 template<SWR_TYPE type
, uint32_t NumBits
>
326 struct TypeTraits
: PackTraits
<NumBits
>
328 static const SWR_TYPE MyType
= type
;
329 static float toFloat() { return 0.0; }
330 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
331 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
334 //////////////////////////////////////////////////////////////////////////
335 /// TypeTraits - Format type traits specialization for UINT8
336 //////////////////////////////////////////////////////////////////////////
337 template<> struct TypeTraits
<SWR_TYPE_UINT
, 8> : PackTraits
<8>
339 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
340 static float toFloat() { return 0.0; }
341 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
342 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT8
//////////////////////////////////////////////////////////////////////////
348 template<> struct TypeTraits
<SWR_TYPE_SINT
, 8> : PackTraits
<8, true>
350 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
351 static float toFloat() { return 0.0; }
352 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
353 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
356 //////////////////////////////////////////////////////////////////////////
357 /// TypeTraits - Format type traits specialization for UINT16
358 //////////////////////////////////////////////////////////////////////////
359 template<> struct TypeTraits
<SWR_TYPE_UINT
, 16> : PackTraits
<16>
361 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
362 static float toFloat() { return 0.0; }
363 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
364 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
367 //////////////////////////////////////////////////////////////////////////
368 /// TypeTraits - Format type traits specialization for SINT16
369 //////////////////////////////////////////////////////////////////////////
370 template<> struct TypeTraits
<SWR_TYPE_SINT
, 16> : PackTraits
<16, true>
372 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
373 static float toFloat() { return 0.0; }
374 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
375 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
378 //////////////////////////////////////////////////////////////////////////
379 /// TypeTraits - Format type traits specialization for UINT32
380 //////////////////////////////////////////////////////////////////////////
381 template<> struct TypeTraits
<SWR_TYPE_UINT
, 32> : PackTraits
<32>
383 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
384 static float toFloat() { return 0.0; }
385 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
386 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT32
//////////////////////////////////////////////////////////////////////////
392 template<> struct TypeTraits
<SWR_TYPE_SINT
, 32> : PackTraits
<32>
394 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
395 static float toFloat() { return 0.0; }
396 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
397 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
400 //////////////////////////////////////////////////////////////////////////
401 /// TypeTraits - Format type traits specialization for UNORM5
402 //////////////////////////////////////////////////////////////////////////
403 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 5> : PackTraits
<5>
405 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
406 static float toFloat() { return 1.0f
/ 31.0f
; }
407 static float fromFloat() { return 31.0f
; }
408 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
411 //////////////////////////////////////////////////////////////////////////
412 /// TypeTraits - Format type traits specialization for UNORM6
413 //////////////////////////////////////////////////////////////////////////
414 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 6> : PackTraits
<6>
416 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
417 static float toFloat() { return 1.0f
/ 63.0f
; }
418 static float fromFloat() { return 63.0f
; }
419 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
422 //////////////////////////////////////////////////////////////////////////
423 /// TypeTraits - Format type traits specialization for UNORM8
424 //////////////////////////////////////////////////////////////////////////
425 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 8> : PackTraits
<8>
427 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
428 static float toFloat() { return 1.0f
/ 255.0f
; }
429 static float fromFloat() { return 255.0f
; }
430 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM8
//////////////////////////////////////////////////////////////////////////
436 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 8> : PackTraits
<8, true>
438 static const SWR_TYPE MyType
= SWR_TYPE_SNORM
;
439 static float toFloat() { return 1.0f
/ 127.0f
; }
440 static float fromFloat() { return 127.0f
; }
441 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
444 //////////////////////////////////////////////////////////////////////////
445 /// TypeTraits - Format type traits specialization for UNORM16
446 //////////////////////////////////////////////////////////////////////////
447 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 16> : PackTraits
<16>
449 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
450 static float toFloat() { return 1.0f
/ 65535.0f
; }
451 static float fromFloat() { return 65535.0f
; }
452 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
455 //////////////////////////////////////////////////////////////////////////
456 /// TypeTraits - Format type traits specialization for SNORM16
457 //////////////////////////////////////////////////////////////////////////
458 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 16> : PackTraits
<16, true>
460 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
461 static float toFloat() { return 1.0f
/ 32767.0f
; }
462 static float fromFloat() { return 32767.0f
; }
463 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
466 //////////////////////////////////////////////////////////////////////////
467 /// TypeTraits - Format type traits specialization for UNORM24
468 //////////////////////////////////////////////////////////////////////////
470 struct TypeTraits
< SWR_TYPE_UNORM
, 24 > : PackTraits
<32>
472 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
473 static float toFloat() { return 1.0f
/ 16777215.0f
; }
474 static float fromFloat() { return 16777215.0f
; }
475 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
478 //////////////////////////////////////////////////////////////////////////
479 // FLOAT Specializations from here on...
480 //////////////////////////////////////////////////////////////////////////
// Reinterpret a 128-bit register between its float and integer views.
// These are casts only; the bit pattern is not converted.
#define TO_M128i(a) _mm_castps_si128(a)
#define TO_M128(a) _mm_castsi128_ps(a)
/// @brief Approximates coeff-weighted arg^(expnum/expden) on four floats by
///        treating the IEEE-754 bit pattern as a scaled logarithm.
///
/// @tparam expnum, expden      exponent as the fraction expnum / expden
/// @tparam coeffnum, coeffden  correction coefficient as a fraction
/// @param  arg                 four input values
/// @return four approximate results
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static __m128 fastpow(__m128 arg) {
    // Constant pre-correction factor, computed once per instantiation.
    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));

    // Apply the pre-correction factor.
    __m128 ret = _mm_mul_ps(arg, factor);

    // Read the float's bits as an integer and convert that integer to float:
    // this yields an approximate logarithm.
    ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));

    // Multiply the logarithm by the power.
    ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));

    // Convert back to an integer and read its bits as a float to exponentiate.
    ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));

    return ret;
}
510 inline static __m128
pow512_4(__m128 arg
) {
511 // 5/12 is too small, so compute the 4th root of 20/12 instead.
512 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
513 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
514 __m128 xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
515 __m128 xover
= _mm_mul_ps(arg
, xf
);
517 __m128 xfm1
= _mm_rsqrt_ps(xf
);
518 __m128 x2
= _mm_mul_ps(arg
, arg
);
519 __m128 xunder
= _mm_mul_ps(x2
, xfm1
);
521 // sqrt2 * over + 2 * sqrt2 * under
522 __m128 xavg
= _mm_mul_ps(_mm_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
),
523 _mm_add_ps(xover
, xunder
));
525 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
526 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
/// @brief Raises each of the four floats in Base to the power Exp using powf.
/// @param Base  four input values
/// @param Exp   exponent applied to every lane
/// @return four results, lane i holding powf(Base[i], Exp)
inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    // Go through a plain array so that lane i of the result comes from lane i
    // of the input. _mm_set_ps takes its arguments highest lane first, so
    // passing powf(f[0], ...) as its first argument reversed the lanes.
    float lanes[4];
    _mm_storeu_ps(lanes, Base);
    for (int i = 0; i < 4; ++i)
    {
        lanes[i] = powf(lanes[i], Exp);
    }
    return _mm_loadu_ps(lanes);
}
540 static inline __m128
ConvertFloatToSRGB2(__m128
& Src
)
542 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
543 __m128i CmpToSRGBThresholdMask
= TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f
), Src
));
545 // squeeze the mask down to 16 bits (4 bits per DWORD)
546 int CompareResult
= _mm_movemask_epi8(CmpToSRGBThresholdMask
);
551 if (CompareResult
== 0xFFFF)
553 // all DWORDs are <= the threshold
554 Result
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
556 else if (CompareResult
== 0x0)
558 // all DWORDs are > the threshold
559 __m128 fSrc_0RGB
= Src
;
561 // --> 1.055f * c(1.0f/2.4f) - 0.055f
562 #if KNOB_USE_FAST_SRGB == TRUE
563 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
564 __m128 f
= pow512_4(fSrc_0RGB
);
566 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
568 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
569 Result
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
573 // some DWORDs are <= the threshold and some are > threshold
574 __m128 Src_0RGB_mul_denorm
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
576 __m128 fSrc_0RGB
= Src
;
578 // --> 1.055f * c(1.0f/2.4f) - 0.055f
579 #if KNOB_USE_FAST_SRGB == TRUE
580 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
581 __m128 f
= pow512_4(fSrc_0RGB
);
583 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
585 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
586 f
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
588 // Clear the alpha (is garbage after the sub)
589 __m128i i
= _mm_and_si128(TO_M128i(f
), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
591 __m128i LessThanPart
= _mm_and_si128(CmpToSRGBThresholdMask
, TO_M128i(Src_0RGB_mul_denorm
));
592 __m128i GreaterEqualPart
= _mm_andnot_si128(CmpToSRGBThresholdMask
, i
);
593 __m128i CombinedParts
= _mm_or_si128(LessThanPart
, GreaterEqualPart
);
595 Result
= TO_M128(CombinedParts
);
601 //////////////////////////////////////////////////////////////////////////
602 /// TypeTraits - Format type traits specialization for FLOAT16
603 //////////////////////////////////////////////////////////////////////////
604 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 16> : PackTraits
<16>
606 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
607 static float toFloat() { return 1.0f
; }
608 static float fromFloat() { return 1.0f
; }
609 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
611 static simdscalar
pack(const simdscalar
&in
)
613 #if KNOB_SIMD_WIDTH == 8
614 #if (KNOB_ARCH == KNOB_ARCH_AVX)
615 // input is 8 packed float32, output is 8 packed float16
616 simdscalari src
= _simd_castps_si(in
);
618 static const uint32_t FLOAT_EXP_BITS
= 8;
619 static const uint32_t FLOAT_MANTISSA_BITS
= 23;
620 static const uint32_t FLOAT_MANTISSA_MASK
= (1U << FLOAT_MANTISSA_BITS
) - 1;
621 static const uint32_t FLOAT_EXP_MASK
= ((1U << FLOAT_EXP_BITS
) - 1) << FLOAT_MANTISSA_BITS
;
623 static const uint32_t HALF_EXP_BITS
= 5;
624 static const uint32_t HALF_MANTISSA_BITS
= 10;
625 static const uint32_t HALF_EXP_MASK
= ((1U << HALF_EXP_BITS
) - 1) << HALF_MANTISSA_BITS
;
627 // minimum exponent required, exponents below this are flushed to 0.
628 static const int32_t HALF_EXP_MIN
= -14;
629 static const int32_t FLOAT_EXP_BIAS
= 127;
630 static const int32_t FLOAT_EXP_MIN
= HALF_EXP_MIN
+ FLOAT_EXP_BIAS
;
631 static const int32_t FLOAT_EXP_MIN_FTZ
= FLOAT_EXP_MIN
- (HALF_MANTISSA_BITS
+ 1); // +1 for the lack of implicit significand
633 // maximum exponent required, exponents above this are set to infinity
634 static const int32_t HALF_EXP_MAX
= 15;
635 static const int32_t FLOAT_EXP_MAX
= HALF_EXP_MAX
+ FLOAT_EXP_BIAS
;
637 const simdscalari vSignMask
= _simd_set1_epi32(0x80000000);
638 const simdscalari vExpMask
= _simd_set1_epi32(FLOAT_EXP_MASK
);
639 const simdscalari vManMask
= _simd_set1_epi32(FLOAT_MANTISSA_MASK
);
640 const simdscalari vExpMin
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN
<< FLOAT_MANTISSA_BITS
));
641 const simdscalari vExpMinFtz
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN_FTZ
<< FLOAT_MANTISSA_BITS
));
642 const simdscalari vExpMax
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MAX
<< FLOAT_MANTISSA_BITS
));
644 simdscalari vSign
= _simd_and_si(src
, vSignMask
);
645 simdscalari vExp
= _simd_and_si(src
, vExpMask
);
646 simdscalari vMan
= _simd_and_si(src
, vManMask
);
648 simdscalari vFTZMask
= _simd_cmplt_epi32(vExp
, vExpMinFtz
);
649 simdscalari vDenormMask
= _simd_andnot_si(vFTZMask
, _simd_cmplt_epi32(vExp
, vExpMin
));
650 simdscalari vInfMask
= _simd_cmpeq_epi32(vExpMask
, vExp
);
651 simdscalari vClampMask
= _simd_andnot_si(vInfMask
, _simd_cmplt_epi32(vExpMax
, vExp
));
653 simdscalari vHalfExp
= _simd_add_epi32(_simd_sub_epi32(vExp
, vExpMin
), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS
));
655 // pack output 16-bits into the lower 16-bits of each 32-bit channel
656 simdscalari vDst
= _simd_and_si(_simd_srli_epi32(vHalfExp
, 13), _simd_set1_epi32(HALF_EXP_MASK
));
657 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vMan
, FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
660 vDst
= _simd_andnot_si(vFTZMask
, vDst
);
661 // Apply Infinites / NaN
662 vDst
= _simd_or_si(vDst
, _simd_and_si(vInfMask
, _simd_set1_epi32(HALF_EXP_MASK
)));
665 vDst
= _simd_andnot_si(vClampMask
, vDst
);
666 vDst
= _simd_or_si(vDst
,
667 _simd_and_si(vClampMask
, _simd_set1_epi32(0x7BFF)));
669 // Compute Denormals (subnormals)
670 if (!_mm256_testz_si256(vDenormMask
, vDenormMask
))
672 uint32_t *pDenormMask
= (uint32_t*)&vDenormMask
;
673 uint32_t *pExp
= (uint32_t*)&vExp
;
674 uint32_t *pMan
= (uint32_t*)&vMan
;
675 uint32_t *pDst
= (uint32_t*)&vDst
;
676 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
680 // Need to compute subnormal value
681 uint32_t exponent
= pExp
[i
] >> FLOAT_MANTISSA_BITS
;
682 uint32_t mantissa
= pMan
[i
] |
683 (1U << FLOAT_MANTISSA_BITS
); // Denorms include no "implicit" 1s. Make it explicit
685 pDst
[i
] = mantissa
>> ((FLOAT_EXP_MIN
- exponent
) + (FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
691 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vSign
, 16));
693 // Pack to lower 128-bits
694 vDst
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst
), _mm256_extractf128_si256(vDst
, 1)));
698 simdscalari vCheck
= _mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
));
700 for (uint32_t i
= 0; i
< 4; ++i
)
702 SWR_ASSERT(vCheck
.m256i_i32
[i
] == vDst
.m256i_i32
[i
]);
707 return _simd_castsi_ps(vDst
);
710 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
)));
713 #error Unsupported vector width
717 static simdscalar
unpack(const simdscalar
&in
)
719 // input is 8 packed float16, output is 8 packed float32
720 SWR_ASSERT(0); // @todo
721 return _simd_setzero_ps();
725 //////////////////////////////////////////////////////////////////////////
726 /// TypeTraits - Format type traits specialization for FLOAT32
727 //////////////////////////////////////////////////////////////////////////
728 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 32> : PackTraits
<32>
730 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
731 static float toFloat() { return 1.0f
; }
732 static float fromFloat() { return 1.0f
; }
733 static inline simdscalar
convertSrgb(simdscalar
&in
)
735 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
736 __m128 srcLo
= _mm256_extractf128_ps(in
, 0);
737 __m128 srcHi
= _mm256_extractf128_ps(in
, 1);
739 srcLo
= ConvertFloatToSRGB2(srcLo
);
740 srcHi
= ConvertFloatToSRGB2(srcHi
);
742 in
= _mm256_insertf128_ps(in
, srcLo
, 0);
743 in
= _mm256_insertf128_ps(in
, srcHi
, 1);
750 //////////////////////////////////////////////////////////////////////////
751 /// Format1 - Bitfield for single component formats.
752 //////////////////////////////////////////////////////////////////////////
760 ///@ The following are here to provide full template needed in Formats.
767 //////////////////////////////////////////////////////////////////////////
768 /// Format1 - Bitfield for single component formats - 8 bit specialization
769 //////////////////////////////////////////////////////////////////////////
777 ///@ The following are here to provide full template needed in Formats.
784 //////////////////////////////////////////////////////////////////////////
785 /// Format1 - Bitfield for single component formats - 16 bit specialization
786 //////////////////////////////////////////////////////////////////////////
794 ///@ The following are here to provide full template needed in Formats.
801 //////////////////////////////////////////////////////////////////////////
802 /// Format2 - Bitfield for 2 component formats.
803 //////////////////////////////////////////////////////////////////////////
804 template<uint32_t x
, uint32_t y
>
814 ///@ The following are here to provide full template needed in Formats.
820 //////////////////////////////////////////////////////////////////////////
821 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
822 //////////////////////////////////////////////////////////////////////////
833 ///@ The following are here to provide full template needed in Formats.
839 //////////////////////////////////////////////////////////////////////////
840 /// Format3 - Bitfield for 3 component formats.
841 //////////////////////////////////////////////////////////////////////////
842 template<uint32_t x
, uint32_t y
, uint32_t z
>
851 uint32_t a
; ///@note This is here to provide full template needed in Formats.
854 //////////////////////////////////////////////////////////////////////////
855 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
856 //////////////////////////////////////////////////////////////////////////
866 uint16_t a
; ///@note This is here to provide full template needed in Formats.
869 //////////////////////////////////////////////////////////////////////////
870 /// Format4 - Bitfield for 4 component formats.
871 //////////////////////////////////////////////////////////////////////////
872 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
881 //////////////////////////////////////////////////////////////////////////
882 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
883 //////////////////////////////////////////////////////////////////////////
885 struct Format4
<5,5,5,1>
893 //////////////////////////////////////////////////////////////////////////
894 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
895 //////////////////////////////////////////////////////////////////////////
897 struct Format4
<4,4,4,4>
905 //////////////////////////////////////////////////////////////////////////
906 /// ComponentTraits - Default components
907 //////////////////////////////////////////////////////////////////////////
908 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
911 INLINE
static uint32_t GetDefault(uint32_t comp
)
913 static const uint32_t defaults
[4]{ x
, y
, z
, w
};
914 return defaults
[comp
];
918 //////////////////////////////////////////////////////////////////////////
919 /// ComponentTraits - Component type traits.
920 //////////////////////////////////////////////////////////////////////////
921 template<SWR_TYPE X
, uint32_t NumBitsX
, SWR_TYPE Y
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsY
= 0, SWR_TYPE Z
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsZ
= 0, SWR_TYPE W
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsW
= 0>
922 struct ComponentTraits
924 INLINE
static SWR_TYPE
GetType(uint32_t comp
)
926 static const SWR_TYPE CompType
[4]{ X
, Y
, Z
, W
};
927 return CompType
[comp
];
930 INLINE
static uint32_t GetBPC(uint32_t comp
)
932 static const uint32_t MyBpc
[4]{ NumBitsX
, NumBitsY
, NumBitsZ
, NumBitsW
};
936 INLINE
static bool isNormalized(uint32_t comp
)
941 return (X
== SWR_TYPE_UNORM
|| X
== SWR_TYPE_SNORM
) ? true : false;
943 return (Y
== SWR_TYPE_UNORM
|| Y
== SWR_TYPE_SNORM
) ? true : false;
945 return (Z
== SWR_TYPE_UNORM
|| Z
== SWR_TYPE_SNORM
) ? true : false;
947 return (W
== SWR_TYPE_UNORM
|| W
== SWR_TYPE_SNORM
) ? true : false;
953 INLINE
static float toFloat(uint32_t comp
)
958 return TypeTraits
<X
, NumBitsX
>::toFloat();
960 return TypeTraits
<Y
, NumBitsY
>::toFloat();
962 return TypeTraits
<Z
, NumBitsZ
>::toFloat();
964 return TypeTraits
<W
, NumBitsW
>::toFloat();
967 return TypeTraits
<X
, NumBitsX
>::toFloat();
971 INLINE
static float fromFloat(uint32_t comp
)
976 return TypeTraits
<X
, NumBitsX
>::fromFloat();
978 return TypeTraits
<Y
, NumBitsY
>::fromFloat();
980 return TypeTraits
<Z
, NumBitsZ
>::fromFloat();
982 return TypeTraits
<W
, NumBitsW
>::fromFloat();
985 return TypeTraits
<X
, NumBitsX
>::fromFloat();
988 INLINE
static simdscalar
loadSOA(uint32_t comp
, const uint8_t* pSrc
)
993 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
995 return TypeTraits
<Y
, NumBitsY
>::loadSOA(pSrc
);
997 return TypeTraits
<Z
, NumBitsZ
>::loadSOA(pSrc
);
999 return TypeTraits
<W
, NumBitsW
>::loadSOA(pSrc
);
1002 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1005 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simdscalar src
)
1010 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1013 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1016 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1019 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1023 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1026 INLINE
static simdscalar
unpack(uint32_t comp
, simdscalar
&in
)
1031 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1033 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1035 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1037 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1040 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1043 INLINE
static simdscalar
pack(uint32_t comp
, simdscalar
&in
)
1048 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1050 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1052 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1054 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1057 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1060 INLINE
static simdscalar
convertSrgb(uint32_t comp
, simdscalar
&in
)
1065 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1067 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1069 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1071 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1074 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);