1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for SWR_FORMAT functions.
27 ******************************************************************************/
30 //////////////////////////////////////////////////////////////////////////
31 /// PackTraits - Helpers for packing / unpacking same pixel sizes
32 //////////////////////////////////////////////////////////////////////////
33 template <uint32_t NumBits
, bool Signed
= false>
36 static const uint32_t MyNumBits
= NumBits
;
37 static simdscalar
loadSOA(const uint8_t *pSrc
) = delete;
38 static void storeSOA(uint8_t *pDst
, simdscalar src
) = delete;
39 static simdscalar
unpack(simdscalar
&in
) = delete;
40 static simdscalar
pack(simdscalar
&in
) = delete;
43 //////////////////////////////////////////////////////////////////////////
44 /// PackTraits - Helpers for packing / unpacking unused channels
45 //////////////////////////////////////////////////////////////////////////
47 struct PackTraits
<0, false>
49 static const uint32_t MyNumBits
= 0;
51 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_setzero_ps(); }
52 static void storeSOA(uint8_t *pDst
, simdscalar src
) { return; }
53 static simdscalar
unpack(simdscalar
&in
) { return _simd_setzero_ps(); }
54 static simdscalar
pack(simdscalar
&in
) { return _simd_setzero_ps(); }
58 //////////////////////////////////////////////////////////////////////////
59 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
60 //////////////////////////////////////////////////////////////////////////
62 struct PackTraits
<8, false>
64 static const uint32_t MyNumBits
= 8;
66 static simdscalar
loadSOA(const uint8_t *pSrc
)
68 #if KNOB_SIMD_WIDTH == 8
69 __m256 result
= _mm256_setzero_ps();
70 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
71 return _mm256_insertf128_ps(result
, vLo
, 0);
73 #error Unsupported vector width
77 static void storeSOA(uint8_t *pDst
, simdscalar src
)
80 #if KNOB_SIMD_WIDTH == 8
81 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
83 #error Unsupported vector width
87 static simdscalar
unpack(simdscalar
&in
)
89 #if KNOB_SIMD_WIDTH == 8
90 #if KNOB_ARCH==KNOB_ARCH_AVX
91 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
92 __m128i resLo
= _mm_cvtepu8_epi32(src
);
93 __m128i resHi
= _mm_shuffle_epi8(src
,
94 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
96 __m256i result
= _mm256_castsi128_si256(resLo
);
97 result
= _mm256_insertf128_si256(result
, resHi
, 1);
98 return _mm256_castsi256_ps(result
);
99 #elif KNOB_ARCH==KNOB_ARCH_AVX2
100 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
103 #error Unsupported vector width
107 static simdscalar
pack(simdscalar
&in
)
109 #if KNOB_SIMD_WIDTH == 8
110 simdscalari src
= _simd_castps_si(in
);
111 __m128i res16
= _mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
112 __m128i res8
= _mm_packus_epi16(res16
, _mm_undefined_si128());
113 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
115 #error Unsupported vector width
120 //////////////////////////////////////////////////////////////////////////
121 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
122 //////////////////////////////////////////////////////////////////////////
124 struct PackTraits
<8, true>
126 static const uint32_t MyNumBits
= 8;
128 static simdscalar
loadSOA(const uint8_t *pSrc
)
130 #if KNOB_SIMD_WIDTH == 8
131 __m256 result
= _mm256_setzero_ps();
132 __m128 vLo
= _mm_castpd_ps(_mm_load_sd((double*)pSrc
));
133 return _mm256_insertf128_ps(result
, vLo
, 0);
135 #error Unsupported vector width
139 static void storeSOA(uint8_t *pDst
, simdscalar src
)
142 #if KNOB_SIMD_WIDTH == 8
143 _mm_storel_pd((double*)pDst
, _mm_castps_pd(_mm256_castps256_ps128(src
)));
145 #error Unsupported vector width
149 static simdscalar
unpack(simdscalar
&in
)
151 #if KNOB_SIMD_WIDTH == 8
152 #if KNOB_ARCH==KNOB_ARCH_AVX
153 SWR_ASSERT(0); // I think this may be incorrect.
154 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
155 __m128i resLo
= _mm_cvtepi8_epi32(src
);
156 __m128i resHi
= _mm_shuffle_epi8(src
,
157 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
159 __m256i result
= _mm256_castsi128_si256(resLo
);
160 result
= _mm256_insertf128_si256(result
, resHi
, 1);
161 return _mm256_castsi256_ps(result
);
162 #elif KNOB_ARCH==KNOB_ARCH_AVX2
163 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
166 #error Unsupported vector width
170 static simdscalar
pack(simdscalar
&in
)
172 #if KNOB_SIMD_WIDTH == 8
173 simdscalari src
= _simd_castps_si(in
);
174 __m128i res16
= _mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1));
175 __m128i res8
= _mm_packs_epi16(res16
, _mm_undefined_si128());
176 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8
));
178 #error Unsupported vector width
183 //////////////////////////////////////////////////////////////////////////
184 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
185 //////////////////////////////////////////////////////////////////////////
187 struct PackTraits
<16, false>
189 static const uint32_t MyNumBits
= 16;
191 static simdscalar
loadSOA(const uint8_t *pSrc
)
193 #if KNOB_SIMD_WIDTH == 8
194 __m256 result
= _mm256_setzero_ps();
195 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
196 return _mm256_insertf128_ps(result
, vLo
, 0);
198 #error Unsupported vector width
202 static void storeSOA(uint8_t *pDst
, simdscalar src
)
204 #if KNOB_SIMD_WIDTH == 8
205 // store 16B (2B * 8)
206 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
208 #error Unsupported vector width
212 static simdscalar
unpack(simdscalar
&in
)
214 #if KNOB_SIMD_WIDTH == 8
215 #if KNOB_ARCH==KNOB_ARCH_AVX
216 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
217 __m128i resLo
= _mm_cvtepu16_epi32(src
);
218 __m128i resHi
= _mm_shuffle_epi8(src
,
219 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
221 __m256i result
= _mm256_castsi128_si256(resLo
);
222 result
= _mm256_insertf128_si256(result
, resHi
, 1);
223 return _mm256_castsi256_ps(result
);
224 #elif KNOB_ARCH==KNOB_ARCH_AVX2
225 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
228 #error Unsupported vector width
232 static simdscalar
pack(simdscalar
&in
)
234 #if KNOB_SIMD_WIDTH == 8
235 simdscalari src
= _simd_castps_si(in
);
236 __m256i res
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
237 return _mm256_castsi256_ps(res
);
239 #error Unsupported vector width
244 //////////////////////////////////////////////////////////////////////////
245 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
246 //////////////////////////////////////////////////////////////////////////
248 struct PackTraits
<16, true>
250 static const uint32_t MyNumBits
= 16;
252 static simdscalar
loadSOA(const uint8_t *pSrc
)
254 #if KNOB_SIMD_WIDTH == 8
255 __m256 result
= _mm256_setzero_ps();
256 __m128 vLo
= _mm_load_ps((const float*)pSrc
);
257 return _mm256_insertf128_ps(result
, vLo
, 0);
259 #error Unsupported vector width
263 static void storeSOA(uint8_t *pDst
, simdscalar src
)
265 #if KNOB_SIMD_WIDTH == 8
266 // store 16B (2B * 8)
267 _mm_store_ps((float*)pDst
, _mm256_castps256_ps128(src
));
269 #error Unsupported vector width
273 static simdscalar
unpack(simdscalar
&in
)
275 #if KNOB_SIMD_WIDTH == 8
276 #if KNOB_ARCH==KNOB_ARCH_AVX
277 SWR_ASSERT(0); // I think this is incorrectly implemented
278 __m128i src
= _mm_castps_si128(_mm256_castps256_ps128(in
));
279 __m128i resLo
= _mm_cvtepi16_epi32(src
);
280 __m128i resHi
= _mm_shuffle_epi8(src
,
281 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
283 __m256i result
= _mm256_castsi128_si256(resLo
);
284 result
= _mm256_insertf128_si256(result
, resHi
, 1);
285 return _mm256_castsi256_ps(result
);
286 #elif KNOB_ARCH==KNOB_ARCH_AVX2
287 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in
))));
290 #error Unsupported vector width
294 static simdscalar
pack(simdscalar
&in
)
296 #if KNOB_SIMD_WIDTH == 8
297 simdscalari src
= _simd_castps_si(in
);
298 __m256i res
= _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src
), _mm256_extractf128_si256(src
, 1)));
299 return _mm256_castsi256_ps(res
);
301 #error Unsupported vector width
306 //////////////////////////////////////////////////////////////////////////
307 /// PackTraits - Helpers for packing / unpacking 32 bit channels
308 //////////////////////////////////////////////////////////////////////////
310 struct PackTraits
<32, false>
312 static const uint32_t MyNumBits
= 32;
314 static simdscalar
loadSOA(const uint8_t *pSrc
) { return _simd_load_ps((const float*)pSrc
); }
315 static void storeSOA(uint8_t *pDst
, simdscalar src
) { _simd_store_ps((float*)pDst
, src
); }
316 static simdscalar
unpack(simdscalar
&in
) { return in
; }
317 static simdscalar
pack(simdscalar
&in
) { return in
; }
320 //////////////////////////////////////////////////////////////////////////
321 /// TypeTraits - Format type traits.
322 //////////////////////////////////////////////////////////////////////////
323 template<SWR_TYPE type
, uint32_t NumBits
>
324 struct TypeTraits
: PackTraits
<NumBits
>
326 static const SWR_TYPE MyType
= type
;
327 static float toFloat() { return 0.0; }
328 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
329 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
332 //////////////////////////////////////////////////////////////////////////
333 /// TypeTraits - Format type traits specialization for UINT8
334 //////////////////////////////////////////////////////////////////////////
335 template<> struct TypeTraits
<SWR_TYPE_UINT
, 8> : PackTraits
<8>
337 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
338 static float toFloat() { return 0.0; }
339 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
340 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
343 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT8
345 //////////////////////////////////////////////////////////////////////////
346 template<> struct TypeTraits
<SWR_TYPE_SINT
, 8> : PackTraits
<8, true>
348 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
349 static float toFloat() { return 0.0; }
350 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
351 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
354 //////////////////////////////////////////////////////////////////////////
355 /// TypeTraits - Format type traits specialization for UINT16
356 //////////////////////////////////////////////////////////////////////////
357 template<> struct TypeTraits
<SWR_TYPE_UINT
, 16> : PackTraits
<16>
359 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
360 static float toFloat() { return 0.0; }
361 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
362 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
365 //////////////////////////////////////////////////////////////////////////
366 /// TypeTraits - Format type traits specialization for SINT16
367 //////////////////////////////////////////////////////////////////////////
368 template<> struct TypeTraits
<SWR_TYPE_SINT
, 16> : PackTraits
<16, true>
370 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
371 static float toFloat() { return 0.0; }
372 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
373 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
376 //////////////////////////////////////////////////////////////////////////
377 /// TypeTraits - Format type traits specialization for UINT32
378 //////////////////////////////////////////////////////////////////////////
379 template<> struct TypeTraits
<SWR_TYPE_UINT
, 32> : PackTraits
<32>
381 static const SWR_TYPE MyType
= SWR_TYPE_UINT
;
382 static float toFloat() { return 0.0; }
383 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
384 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
387 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT32
389 //////////////////////////////////////////////////////////////////////////
390 template<> struct TypeTraits
<SWR_TYPE_SINT
, 32> : PackTraits
<32>
392 static const SWR_TYPE MyType
= SWR_TYPE_SINT
;
393 static float toFloat() { return 0.0; }
394 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
395 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
398 //////////////////////////////////////////////////////////////////////////
399 /// TypeTraits - Format type traits specialization for UNORM5
400 //////////////////////////////////////////////////////////////////////////
401 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 5> : PackTraits
<5>
403 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
404 static float toFloat() { return 1.0f
/ 31.0f
; }
405 static float fromFloat() { return 31.0f
; }
406 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
409 //////////////////////////////////////////////////////////////////////////
410 /// TypeTraits - Format type traits specialization for UNORM6
411 //////////////////////////////////////////////////////////////////////////
412 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 6> : PackTraits
<6>
414 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
415 static float toFloat() { return 1.0f
/ 63.0f
; }
416 static float fromFloat() { return 63.0f
; }
417 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
420 //////////////////////////////////////////////////////////////////////////
421 /// TypeTraits - Format type traits specialization for UNORM8
422 //////////////////////////////////////////////////////////////////////////
423 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 8> : PackTraits
<8>
425 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
426 static float toFloat() { return 1.0f
/ 255.0f
; }
427 static float fromFloat() { return 255.0f
; }
428 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
431 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM8
433 //////////////////////////////////////////////////////////////////////////
434 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 8> : PackTraits
<8, true>
436 static const SWR_TYPE MyType
= SWR_TYPE_SNORM
;
437 static float toFloat() { return 1.0f
/ 127.0f
; }
438 static float fromFloat() { return 127.0f
; }
439 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
442 //////////////////////////////////////////////////////////////////////////
443 /// TypeTraits - Format type traits specialization for UNORM16
444 //////////////////////////////////////////////////////////////////////////
445 template<> struct TypeTraits
<SWR_TYPE_UNORM
, 16> : PackTraits
<16>
447 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
448 static float toFloat() { return 1.0f
/ 65535.0f
; }
449 static float fromFloat() { return 65535.0f
; }
450 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
453 //////////////////////////////////////////////////////////////////////////
454 /// TypeTraits - Format type traits specialization for SNORM16
455 //////////////////////////////////////////////////////////////////////////
456 template<> struct TypeTraits
<SWR_TYPE_SNORM
, 16> : PackTraits
<16, true>
458 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
459 static float toFloat() { return 1.0f
/ 32767.0f
; }
460 static float fromFloat() { return 32767.0f
; }
461 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
464 //////////////////////////////////////////////////////////////////////////
465 /// TypeTraits - Format type traits specialization for UNORM24
466 //////////////////////////////////////////////////////////////////////////
468 struct TypeTraits
< SWR_TYPE_UNORM
, 24 > : PackTraits
<32>
470 static const SWR_TYPE MyType
= SWR_TYPE_UNORM
;
471 static float toFloat() { return 1.0f
/ 16777215.0f
; }
472 static float fromFloat() { return 16777215.0f
; }
473 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
476 //////////////////////////////////////////////////////////////////////////
477 // FLOAT Specializations from here on...
478 //////////////////////////////////////////////////////////////////////////
479 #define TO_M128i(a) _mm_castps_si128(a)
480 #define TO_M128(a) _mm_castsi128_ps(a)
484 template< unsigned expnum
, unsigned expden
, unsigned coeffnum
, unsigned coeffden
>
485 inline static __m128
fastpow(__m128 arg
) {
488 static const __m128 factor
= _mm_set1_ps(exp2(127.0f
* expden
/ expnum
- 127.0f
)
489 * powf(1.0f
* coeffnum
/ coeffden
, 1.0f
* expden
/ expnum
));
491 // Apply a constant pre-correction factor.
492 ret
= _mm_mul_ps(ret
, factor
);
494 // Reinterpret arg as integer to obtain logarithm.
495 //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
496 ret
= _mm_cvtepi32_ps(_mm_castps_si128(ret
));
498 // Multiply logarithm by power.
499 ret
= _mm_mul_ps(ret
, _mm_set1_ps(1.0f
* expnum
/ expden
));
501 // Convert back to "integer" to exponentiate.
502 //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
503 ret
= _mm_castsi128_ps(_mm_cvtps_epi32(ret
));
508 inline static __m128
pow512_4(__m128 arg
) {
509 // 5/12 is too small, so compute the 4th root of 20/12 instead.
510 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
511 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
512 __m128 xf
= fastpow
< 2, 3, int(0.629960524947437 * 1e9
), int(1e9
) >(arg
);
513 __m128 xover
= _mm_mul_ps(arg
, xf
);
515 __m128 xfm1
= _mm_rsqrt_ps(xf
);
516 __m128 x2
= _mm_mul_ps(arg
, arg
);
517 __m128 xunder
= _mm_mul_ps(x2
, xfm1
);
519 // sqrt2 * over + 2 * sqrt2 * under
520 __m128 xavg
= _mm_mul_ps(_mm_set1_ps(1.0f
/ (3.0f
* 0.629960524947437f
) * 0.999852f
),
521 _mm_add_ps(xover
, xunder
));
523 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
524 xavg
= _mm_mul_ps(xavg
, _mm_rsqrt_ps(xavg
));
/// Raises each of the four float lanes of Base to the power Exp with scalar
/// powf(). Each result stays in the lane its input came from.
///
/// @param Base  four packed floats
/// @param Exp   exponent applied to every lane
/// @return      four packed floats, lane i holding powf(Base[i], Exp)
inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    float lanes[4];
    _mm_storeu_ps(lanes, Base);

    for (int i = 0; i < 4; ++i)
    {
        lanes[i] = powf(lanes[i], Exp);
    }

    // _mm_set_ps() takes its arguments highest lane first, so passing the
    // lane-0 result as its first argument (as this function used to) reverses
    // the lanes. Reloading from the array keeps lane i in lane i.
    return _mm_loadu_ps(lanes);
}
538 static inline __m128
ConvertFloatToSRGB2(__m128
& Src
)
540 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
541 __m128i CmpToSRGBThresholdMask
= TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f
), Src
));
543 // squeeze the mask down to 16 bits (4 bits per DWORD)
544 int CompareResult
= _mm_movemask_epi8(CmpToSRGBThresholdMask
);
549 if (CompareResult
== 0xFFFF)
551 // all DWORDs are <= the threshold
552 Result
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
554 else if (CompareResult
== 0x0)
556 // all DWORDs are > the threshold
557 __m128 fSrc_0RGB
= Src
;
559 // --> 1.055f * c(1.0f/2.4f) - 0.055f
560 #if KNOB_USE_FAST_SRGB == TRUE
561 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
562 __m128 f
= pow512_4(fSrc_0RGB
);
564 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
566 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
567 Result
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
571 // some DWORDs are <= the threshold and some are > threshold
572 __m128 Src_0RGB_mul_denorm
= _mm_mul_ps(Src
, _mm_set1_ps(12.92f
));
574 __m128 fSrc_0RGB
= Src
;
576 // --> 1.055f * c(1.0f/2.4f) - 0.055f
577 #if KNOB_USE_FAST_SRGB == TRUE
578 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
579 __m128 f
= pow512_4(fSrc_0RGB
);
581 __m128 f
= powf_wrapper(fSrc_0RGB
, 1.0f
/ 2.4f
);
583 f
= _mm_mul_ps(f
, _mm_set1_ps(1.055f
));
584 f
= _mm_sub_ps(f
, _mm_set1_ps(0.055f
));
586 // Clear the alpha (is garbage after the sub)
587 __m128i i
= _mm_and_si128(TO_M128i(f
), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
589 __m128i LessThanPart
= _mm_and_si128(CmpToSRGBThresholdMask
, TO_M128i(Src_0RGB_mul_denorm
));
590 __m128i GreaterEqualPart
= _mm_andnot_si128(CmpToSRGBThresholdMask
, i
);
591 __m128i CombinedParts
= _mm_or_si128(LessThanPart
, GreaterEqualPart
);
593 Result
= TO_M128(CombinedParts
);
599 //////////////////////////////////////////////////////////////////////////
600 /// TypeTraits - Format type traits specialization for FLOAT16
601 //////////////////////////////////////////////////////////////////////////
602 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 16> : PackTraits
<16>
604 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
605 static float toFloat() { return 1.0f
; }
606 static float fromFloat() { return 1.0f
; }
607 static simdscalar
convertSrgb(simdscalar
&in
) { SWR_ASSERT(0); return _simd_setzero_ps(); }
609 static simdscalar
pack(const simdscalar
&in
)
611 #if KNOB_SIMD_WIDTH == 8
612 #if (KNOB_ARCH == KNOB_ARCH_AVX)
613 // input is 8 packed float32, output is 8 packed float16
614 simdscalari src
= _simd_castps_si(in
);
616 static const uint32_t FLOAT_EXP_BITS
= 8;
617 static const uint32_t FLOAT_MANTISSA_BITS
= 23;
618 static const uint32_t FLOAT_MANTISSA_MASK
= (1U << FLOAT_MANTISSA_BITS
) - 1;
619 static const uint32_t FLOAT_EXP_MASK
= ((1U << FLOAT_EXP_BITS
) - 1) << FLOAT_MANTISSA_BITS
;
621 static const uint32_t HALF_EXP_BITS
= 5;
622 static const uint32_t HALF_MANTISSA_BITS
= 10;
623 static const uint32_t HALF_MANTISSA_MASK
= (1U << HALF_MANTISSA_BITS
) - 1;
624 static const uint32_t HALF_EXP_MASK
= ((1U << HALF_EXP_BITS
) - 1) << HALF_MANTISSA_BITS
;
626 // minimum exponent required, exponents below this are flushed to 0.
627 static const int32_t HALF_EXP_MIN
= -14;
628 static const int32_t FLOAT_EXP_BIAS
= 127;
629 static const int32_t FLOAT_EXP_MIN
= HALF_EXP_MIN
+ FLOAT_EXP_BIAS
;
630 static const int32_t FLOAT_EXP_MIN_FTZ
= FLOAT_EXP_MIN
- (HALF_MANTISSA_BITS
+ 1); // +1 for the lack of implicit significand
632 // maximum exponent required, exponents above this are set to infinity
633 static const int32_t HALF_EXP_MAX
= 15;
634 static const int32_t FLOAT_EXP_MAX
= HALF_EXP_MAX
+ FLOAT_EXP_BIAS
;
636 const simdscalari vSignMask
= _simd_set1_epi32(0x80000000);
637 const simdscalari vExpMask
= _simd_set1_epi32(FLOAT_EXP_MASK
);
638 const simdscalari vManMask
= _simd_set1_epi32(FLOAT_MANTISSA_MASK
);
639 const simdscalari vExpMin
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN
<< FLOAT_MANTISSA_BITS
));
640 const simdscalari vExpMinFtz
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MIN_FTZ
<< FLOAT_MANTISSA_BITS
));
641 const simdscalari vExpMax
= _simd_set1_epi32(FLOAT_EXP_MASK
& uint32_t(FLOAT_EXP_MAX
<< FLOAT_MANTISSA_BITS
));
643 simdscalari vSign
= _simd_and_si(src
, vSignMask
);
644 simdscalari vExp
= _simd_and_si(src
, vExpMask
);
645 simdscalari vMan
= _simd_and_si(src
, vManMask
);
647 simdscalari vFTZMask
= _simd_cmplt_epi32(vExp
, vExpMinFtz
);
648 simdscalari vDenormMask
= _simd_andnot_si(vFTZMask
, _simd_cmplt_epi32(vExp
, vExpMin
));
649 simdscalari vInfMask
= _simd_cmpeq_epi32(vExpMask
, vExp
);
650 simdscalari vClampMask
= _simd_andnot_si(vInfMask
, _simd_cmplt_epi32(vExpMax
, vExp
));
652 simdscalari vHalfExp
= _simd_add_epi32(_simd_sub_epi32(vExp
, vExpMin
), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS
));
654 // pack output 16-bits into the lower 16-bits of each 32-bit channel
655 simdscalari vDst
= _simd_and_si(_simd_srli_epi32(vHalfExp
, 13), _simd_set1_epi32(HALF_EXP_MASK
));
656 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vMan
, FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
659 vDst
= _simd_andnot_si(vFTZMask
, vDst
);
660 // Apply Infinites / NaN
661 vDst
= _simd_or_si(vDst
, _simd_and_si(vInfMask
, _simd_set1_epi32(HALF_EXP_MASK
)));
664 vDst
= _simd_andnot_si(vClampMask
, vDst
);
665 vDst
= _simd_or_si(vDst
,
666 _simd_and_si(vClampMask
, _simd_set1_epi32(0x7BFF)));
668 // Compute Denormals (subnormals)
669 if (!_mm256_testz_si256(vDenormMask
, vDenormMask
))
671 uint32_t *pDenormMask
= (uint32_t*)&vDenormMask
;
672 uint32_t *pExp
= (uint32_t*)&vExp
;
673 uint32_t *pMan
= (uint32_t*)&vMan
;
674 uint32_t *pDst
= (uint32_t*)&vDst
;
675 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
679 // Need to compute subnormal value
680 uint32_t exponent
= pExp
[i
] >> FLOAT_MANTISSA_BITS
;
681 uint32_t mantissa
= pMan
[i
] |
682 (1U << FLOAT_MANTISSA_BITS
); // Denorms include no "implicit" 1s. Make it explicit
684 pDst
[i
] = mantissa
>> ((FLOAT_EXP_MIN
- exponent
) + (FLOAT_MANTISSA_BITS
- HALF_MANTISSA_BITS
));
690 vDst
= _simd_or_si(vDst
, _simd_srli_epi32(vSign
, 16));
692 // Pack to lower 128-bits
693 vDst
= _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst
), _mm256_extractf128_si256(vDst
, 1)));
697 simdscalari vCheck
= _mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
));
699 for (uint32_t i
= 0; i
< 4; ++i
)
701 SWR_ASSERT(vCheck
.m256i_i32
[i
] == vDst
.m256i_i32
[i
]);
706 return _simd_castsi_ps(vDst
);
709 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in
, _MM_FROUND_TRUNC
)));
712 #error Unsupported vector width
716 static simdscalar
unpack(const simdscalar
&in
)
718 // input is 8 packed float16, output is 8 packed float32
719 SWR_ASSERT(0); // @todo
720 return _simd_setzero_ps();
724 //////////////////////////////////////////////////////////////////////////
725 /// TypeTraits - Format type traits specialization for FLOAT32
726 //////////////////////////////////////////////////////////////////////////
727 template<> struct TypeTraits
<SWR_TYPE_FLOAT
, 32> : PackTraits
<32>
729 static const SWR_TYPE MyType
= SWR_TYPE_FLOAT
;
730 static float toFloat() { return 1.0f
; }
731 static float fromFloat() { return 1.0f
; }
732 static inline simdscalar
convertSrgb(simdscalar
&in
)
734 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
735 __m128 srcLo
= _mm256_extractf128_ps(in
, 0);
736 __m128 srcHi
= _mm256_extractf128_ps(in
, 1);
738 srcLo
= ConvertFloatToSRGB2(srcLo
);
739 srcHi
= ConvertFloatToSRGB2(srcHi
);
741 in
= _mm256_insertf128_ps(in
, srcLo
, 0);
742 in
= _mm256_insertf128_ps(in
, srcHi
, 1);
749 //////////////////////////////////////////////////////////////////////////
750 /// Format1 - Bitfield for single component formats.
751 //////////////////////////////////////////////////////////////////////////
759 ///@ The following are here to provide full template needed in Formats.
766 //////////////////////////////////////////////////////////////////////////
767 /// Format1 - Bitfield for single component formats - 8 bit specialization
768 //////////////////////////////////////////////////////////////////////////
776 ///@ The following are here to provide full template needed in Formats.
783 //////////////////////////////////////////////////////////////////////////
784 /// Format1 - Bitfield for single component formats - 16 bit specialization
785 //////////////////////////////////////////////////////////////////////////
793 ///@ The following are here to provide full template needed in Formats.
800 //////////////////////////////////////////////////////////////////////////
801 /// Format2 - Bitfield for 2 component formats.
802 //////////////////////////////////////////////////////////////////////////
803 template<uint32_t x
, uint32_t y
>
813 ///@ The following are here to provide full template needed in Formats.
819 //////////////////////////////////////////////////////////////////////////
820 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
821 //////////////////////////////////////////////////////////////////////////
832 ///@ The following are here to provide full template needed in Formats.
838 //////////////////////////////////////////////////////////////////////////
839 /// Format3 - Bitfield for 3 component formats.
840 //////////////////////////////////////////////////////////////////////////
841 template<uint32_t x
, uint32_t y
, uint32_t z
>
850 uint32_t a
; ///@note This is here to provide full template needed in Formats.
853 //////////////////////////////////////////////////////////////////////////
854 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
855 //////////////////////////////////////////////////////////////////////////
865 uint16_t a
; ///@note This is here to provide full template needed in Formats.
868 //////////////////////////////////////////////////////////////////////////
869 /// Format4 - Bitfield for 4 component formats.
870 //////////////////////////////////////////////////////////////////////////
871 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
880 //////////////////////////////////////////////////////////////////////////
881 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
882 //////////////////////////////////////////////////////////////////////////
884 struct Format4
<5,5,5,1>
892 //////////////////////////////////////////////////////////////////////////
893 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
894 //////////////////////////////////////////////////////////////////////////
896 struct Format4
<4,4,4,4>
904 //////////////////////////////////////////////////////////////////////////
905 /// ComponentTraits - Default components
906 //////////////////////////////////////////////////////////////////////////
907 template<uint32_t x
, uint32_t y
, uint32_t z
, uint32_t w
>
910 INLINE
static uint32_t GetDefault(uint32_t comp
)
912 static const uint32_t defaults
[4]{ x
, y
, z
, w
};
913 return defaults
[comp
];
917 //////////////////////////////////////////////////////////////////////////
918 /// ComponentTraits - Component type traits.
919 //////////////////////////////////////////////////////////////////////////
920 template<SWR_TYPE X
, uint32_t NumBitsX
, SWR_TYPE Y
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsY
= 0, SWR_TYPE Z
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsZ
= 0, SWR_TYPE W
= SWR_TYPE_UNKNOWN
, uint32_t NumBitsW
= 0>
921 struct ComponentTraits
923 INLINE
static SWR_TYPE
GetType(uint32_t comp
)
925 static const SWR_TYPE CompType
[4]{ X
, Y
, Z
, W
};
926 return CompType
[comp
];
929 INLINE
static uint32_t GetBPC(uint32_t comp
)
931 static const uint32_t MyBpc
[4]{ NumBitsX
, NumBitsY
, NumBitsZ
, NumBitsW
};
935 INLINE
static bool isNormalized(uint32_t comp
)
940 return (X
== SWR_TYPE_UNORM
|| X
== SWR_TYPE_SNORM
) ? true : false;
942 return (Y
== SWR_TYPE_UNORM
|| Y
== SWR_TYPE_SNORM
) ? true : false;
944 return (Z
== SWR_TYPE_UNORM
|| Z
== SWR_TYPE_SNORM
) ? true : false;
946 return (W
== SWR_TYPE_UNORM
|| W
== SWR_TYPE_SNORM
) ? true : false;
952 INLINE
static float toFloat(uint32_t comp
)
957 return TypeTraits
<X
, NumBitsX
>::toFloat();
959 return TypeTraits
<Y
, NumBitsY
>::toFloat();
961 return TypeTraits
<Z
, NumBitsZ
>::toFloat();
963 return TypeTraits
<W
, NumBitsW
>::toFloat();
966 return TypeTraits
<X
, NumBitsX
>::toFloat();
970 INLINE
static float fromFloat(uint32_t comp
)
975 return TypeTraits
<X
, NumBitsX
>::fromFloat();
977 return TypeTraits
<Y
, NumBitsY
>::fromFloat();
979 return TypeTraits
<Z
, NumBitsZ
>::fromFloat();
981 return TypeTraits
<W
, NumBitsW
>::fromFloat();
984 return TypeTraits
<X
, NumBitsX
>::fromFloat();
987 INLINE
static simdscalar
loadSOA(uint32_t comp
, const uint8_t* pSrc
)
992 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
994 return TypeTraits
<Y
, NumBitsY
>::loadSOA(pSrc
);
996 return TypeTraits
<Z
, NumBitsZ
>::loadSOA(pSrc
);
998 return TypeTraits
<W
, NumBitsW
>::loadSOA(pSrc
);
1001 return TypeTraits
<X
, NumBitsX
>::loadSOA(pSrc
);
1004 INLINE
static void storeSOA(uint32_t comp
, uint8_t *pDst
, simdscalar src
)
1009 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1012 TypeTraits
<Y
, NumBitsY
>::storeSOA(pDst
, src
);
1015 TypeTraits
<Z
, NumBitsZ
>::storeSOA(pDst
, src
);
1018 TypeTraits
<W
, NumBitsW
>::storeSOA(pDst
, src
);
1022 TypeTraits
<X
, NumBitsX
>::storeSOA(pDst
, src
);
1025 INLINE
static simdscalar
unpack(uint32_t comp
, simdscalar
&in
)
1030 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1032 return TypeTraits
<Y
, NumBitsY
>::unpack(in
);
1034 return TypeTraits
<Z
, NumBitsZ
>::unpack(in
);
1036 return TypeTraits
<W
, NumBitsW
>::unpack(in
);
1039 return TypeTraits
<X
, NumBitsX
>::unpack(in
);
1042 INLINE
static simdscalar
pack(uint32_t comp
, simdscalar
&in
)
1047 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1049 return TypeTraits
<Y
, NumBitsY
>::pack(in
);
1051 return TypeTraits
<Z
, NumBitsZ
>::pack(in
);
1053 return TypeTraits
<W
, NumBitsW
>::pack(in
);
1056 return TypeTraits
<X
, NumBitsX
>::pack(in
);
1059 INLINE
static simdscalar
convertSrgb(uint32_t comp
, simdscalar
&in
)
1064 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);
1066 return TypeTraits
<Y
, NumBitsY
>::convertSrgb(in
);
1068 return TypeTraits
<Z
, NumBitsZ
>::convertSrgb(in
);
1070 return TypeTraits
<W
, NumBitsW
>::convertSrgb(in
);
1073 return TypeTraits
<X
, NumBitsX
>::convertSrgb(in
);