swr: [rasterizer core] move MultisampleTrait static from header to cpp
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / format_types.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file format_types.h
24 *
25 * @brief Definitions for SWR_FORMAT functions.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "utils.h"
31
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
34 //////////////////////////////////////////////////////////////////////////
35 template <uint32_t NumBits, bool Signed = false>
36 struct PackTraits
37 {
38 static const uint32_t MyNumBits = NumBits;
39 static simdscalar loadSOA(const uint8_t *pSrc) = delete;
40 static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
41 static simdscalar unpack(simdscalar &in) = delete;
42 static simdscalar pack(simdscalar &in) = delete;
43 };
44
45 //////////////////////////////////////////////////////////////////////////
46 /// PackTraits - Helpers for packing / unpacking unused channels
47 //////////////////////////////////////////////////////////////////////////
48 template <>
49 struct PackTraits<0, false>
50 {
51 static const uint32_t MyNumBits = 0;
52
53 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
54 static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
55 static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
56 static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
57 };
58
59
60 //////////////////////////////////////////////////////////////////////////
61 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
62 //////////////////////////////////////////////////////////////////////////
63 template <>
64 struct PackTraits<8, false>
65 {
66 static const uint32_t MyNumBits = 8;
67
68 static simdscalar loadSOA(const uint8_t *pSrc)
69 {
70 #if KNOB_SIMD_WIDTH == 8
71 __m256 result = _mm256_setzero_ps();
72 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
73 return _mm256_insertf128_ps(result, vLo, 0);
74 #else
75 #error Unsupported vector width
76 #endif
77 }
78
79 static void storeSOA(uint8_t *pDst, simdscalar src)
80 {
81 // store simd bytes
82 #if KNOB_SIMD_WIDTH == 8
83 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
84 #else
85 #error Unsupported vector width
86 #endif
87 }
88
89 static simdscalar unpack(simdscalar &in)
90 {
91 #if KNOB_SIMD_WIDTH == 8
92 #if KNOB_ARCH==KNOB_ARCH_AVX
93 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
94 __m128i resLo = _mm_cvtepu8_epi32(src);
95 __m128i resHi = _mm_shuffle_epi8(src,
96 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
97
98 __m256i result = _mm256_castsi128_si256(resLo);
99 result = _mm256_insertf128_si256(result, resHi, 1);
100 return _mm256_castsi256_ps(result);
101 #elif KNOB_ARCH==KNOB_ARCH_AVX2
102 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
103 #endif
104 #else
105 #error Unsupported vector width
106 #endif
107 }
108
109 static simdscalar pack(simdscalar &in)
110 {
111 #if KNOB_SIMD_WIDTH == 8
112 simdscalari src = _simd_castps_si(in);
113 __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
114 __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
115 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
116 #else
117 #error Unsupported vector width
118 #endif
119 }
120 };
121
122 //////////////////////////////////////////////////////////////////////////
123 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
124 //////////////////////////////////////////////////////////////////////////
125 template <>
126 struct PackTraits<8, true>
127 {
128 static const uint32_t MyNumBits = 8;
129
130 static simdscalar loadSOA(const uint8_t *pSrc)
131 {
132 #if KNOB_SIMD_WIDTH == 8
133 __m256 result = _mm256_setzero_ps();
134 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
135 return _mm256_insertf128_ps(result, vLo, 0);
136 #else
137 #error Unsupported vector width
138 #endif
139 }
140
141 static void storeSOA(uint8_t *pDst, simdscalar src)
142 {
143 // store simd bytes
144 #if KNOB_SIMD_WIDTH == 8
145 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
146 #else
147 #error Unsupported vector width
148 #endif
149 }
150
151 static simdscalar unpack(simdscalar &in)
152 {
153 #if KNOB_SIMD_WIDTH == 8
154 #if KNOB_ARCH==KNOB_ARCH_AVX
155 SWR_ASSERT(0); // I think this may be incorrect.
156 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
157 __m128i resLo = _mm_cvtepi8_epi32(src);
158 __m128i resHi = _mm_shuffle_epi8(src,
159 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
160
161 __m256i result = _mm256_castsi128_si256(resLo);
162 result = _mm256_insertf128_si256(result, resHi, 1);
163 return _mm256_castsi256_ps(result);
164 #elif KNOB_ARCH==KNOB_ARCH_AVX2
165 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
166 #endif
167 #else
168 #error Unsupported vector width
169 #endif
170 }
171
172 static simdscalar pack(simdscalar &in)
173 {
174 #if KNOB_SIMD_WIDTH == 8
175 simdscalari src = _simd_castps_si(in);
176 __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
177 __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
178 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
179 #else
180 #error Unsupported vector width
181 #endif
182 }
183 };
184
185 //////////////////////////////////////////////////////////////////////////
186 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
187 //////////////////////////////////////////////////////////////////////////
188 template <>
189 struct PackTraits<16, false>
190 {
191 static const uint32_t MyNumBits = 16;
192
193 static simdscalar loadSOA(const uint8_t *pSrc)
194 {
195 #if KNOB_SIMD_WIDTH == 8
196 __m256 result = _mm256_setzero_ps();
197 __m128 vLo = _mm_load_ps((const float*)pSrc);
198 return _mm256_insertf128_ps(result, vLo, 0);
199 #else
200 #error Unsupported vector width
201 #endif
202 }
203
204 static void storeSOA(uint8_t *pDst, simdscalar src)
205 {
206 #if KNOB_SIMD_WIDTH == 8
207 // store 16B (2B * 8)
208 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
209 #else
210 #error Unsupported vector width
211 #endif
212 }
213
214 static simdscalar unpack(simdscalar &in)
215 {
216 #if KNOB_SIMD_WIDTH == 8
217 #if KNOB_ARCH==KNOB_ARCH_AVX
218 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
219 __m128i resLo = _mm_cvtepu16_epi32(src);
220 __m128i resHi = _mm_shuffle_epi8(src,
221 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
222
223 __m256i result = _mm256_castsi128_si256(resLo);
224 result = _mm256_insertf128_si256(result, resHi, 1);
225 return _mm256_castsi256_ps(result);
226 #elif KNOB_ARCH==KNOB_ARCH_AVX2
227 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
228 #endif
229 #else
230 #error Unsupported vector width
231 #endif
232 }
233
234 static simdscalar pack(simdscalar &in)
235 {
236 #if KNOB_SIMD_WIDTH == 8
237 simdscalari src = _simd_castps_si(in);
238 __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
239 return _mm256_castsi256_ps(res);
240 #else
241 #error Unsupported vector width
242 #endif
243 }
244 };
245
246 //////////////////////////////////////////////////////////////////////////
247 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
248 //////////////////////////////////////////////////////////////////////////
249 template <>
250 struct PackTraits<16, true>
251 {
252 static const uint32_t MyNumBits = 16;
253
254 static simdscalar loadSOA(const uint8_t *pSrc)
255 {
256 #if KNOB_SIMD_WIDTH == 8
257 __m256 result = _mm256_setzero_ps();
258 __m128 vLo = _mm_load_ps((const float*)pSrc);
259 return _mm256_insertf128_ps(result, vLo, 0);
260 #else
261 #error Unsupported vector width
262 #endif
263 }
264
265 static void storeSOA(uint8_t *pDst, simdscalar src)
266 {
267 #if KNOB_SIMD_WIDTH == 8
268 // store 16B (2B * 8)
269 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
270 #else
271 #error Unsupported vector width
272 #endif
273 }
274
275 static simdscalar unpack(simdscalar &in)
276 {
277 #if KNOB_SIMD_WIDTH == 8
278 #if KNOB_ARCH==KNOB_ARCH_AVX
279 SWR_ASSERT(0); // I think this is incorrectly implemented
280 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
281 __m128i resLo = _mm_cvtepi16_epi32(src);
282 __m128i resHi = _mm_shuffle_epi8(src,
283 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
284
285 __m256i result = _mm256_castsi128_si256(resLo);
286 result = _mm256_insertf128_si256(result, resHi, 1);
287 return _mm256_castsi256_ps(result);
288 #elif KNOB_ARCH==KNOB_ARCH_AVX2
289 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
290 #endif
291 #else
292 #error Unsupported vector width
293 #endif
294 }
295
296 static simdscalar pack(simdscalar &in)
297 {
298 #if KNOB_SIMD_WIDTH == 8
299 simdscalari src = _simd_castps_si(in);
300 __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
301 return _mm256_castsi256_ps(res);
302 #else
303 #error Unsupported vector width
304 #endif
305 }
306 };
307
308 //////////////////////////////////////////////////////////////////////////
309 /// PackTraits - Helpers for packing / unpacking 32 bit channels
310 //////////////////////////////////////////////////////////////////////////
311 template <>
312 struct PackTraits<32, false>
313 {
314 static const uint32_t MyNumBits = 32;
315
316 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
317 static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
318 static simdscalar unpack(simdscalar &in) { return in; }
319 static simdscalar pack(simdscalar &in) { return in; }
320 };
321
322 //////////////////////////////////////////////////////////////////////////
323 /// TypeTraits - Format type traits.
324 //////////////////////////////////////////////////////////////////////////
325 template<SWR_TYPE type, uint32_t NumBits>
326 struct TypeTraits : PackTraits<NumBits>
327 {
328 static const SWR_TYPE MyType = type;
329 static float toFloat() { return 0.0; }
330 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
331 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
332 };
333
334 //////////////////////////////////////////////////////////////////////////
335 /// TypeTraits - Format type traits specialization for UINT8
336 //////////////////////////////////////////////////////////////////////////
337 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
338 {
339 static const SWR_TYPE MyType = SWR_TYPE_UINT;
340 static float toFloat() { return 0.0; }
341 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
342 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
343 };
344
345 //////////////////////////////////////////////////////////////////////////
346 /// TypeTraits - Format type traits specialization for UINT8
347 //////////////////////////////////////////////////////////////////////////
348 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
349 {
350 static const SWR_TYPE MyType = SWR_TYPE_SINT;
351 static float toFloat() { return 0.0; }
352 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
353 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
354 };
355
356 //////////////////////////////////////////////////////////////////////////
357 /// TypeTraits - Format type traits specialization for UINT16
358 //////////////////////////////////////////////////////////////////////////
359 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
360 {
361 static const SWR_TYPE MyType = SWR_TYPE_UINT;
362 static float toFloat() { return 0.0; }
363 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
364 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
365 };
366
367 //////////////////////////////////////////////////////////////////////////
368 /// TypeTraits - Format type traits specialization for SINT16
369 //////////////////////////////////////////////////////////////////////////
370 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
371 {
372 static const SWR_TYPE MyType = SWR_TYPE_SINT;
373 static float toFloat() { return 0.0; }
374 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
375 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
376 };
377
378 //////////////////////////////////////////////////////////////////////////
379 /// TypeTraits - Format type traits specialization for UINT32
380 //////////////////////////////////////////////////////////////////////////
381 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
382 {
383 static const SWR_TYPE MyType = SWR_TYPE_UINT;
384 static float toFloat() { return 0.0; }
385 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
386 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
387 };
388
389 //////////////////////////////////////////////////////////////////////////
390 /// TypeTraits - Format type traits specialization for UINT32
391 //////////////////////////////////////////////////////////////////////////
392 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
393 {
394 static const SWR_TYPE MyType = SWR_TYPE_SINT;
395 static float toFloat() { return 0.0; }
396 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
397 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
398 };
399
400 //////////////////////////////////////////////////////////////////////////
401 /// TypeTraits - Format type traits specialization for UNORM5
402 //////////////////////////////////////////////////////////////////////////
403 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
404 {
405 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
406 static float toFloat() { return 1.0f / 31.0f; }
407 static float fromFloat() { return 31.0f; }
408 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
409 };
410
411 //////////////////////////////////////////////////////////////////////////
412 /// TypeTraits - Format type traits specialization for UNORM6
413 //////////////////////////////////////////////////////////////////////////
414 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
415 {
416 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
417 static float toFloat() { return 1.0f / 63.0f; }
418 static float fromFloat() { return 63.0f; }
419 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
420 };
421
422 //////////////////////////////////////////////////////////////////////////
423 /// TypeTraits - Format type traits specialization for UNORM8
424 //////////////////////////////////////////////////////////////////////////
425 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
426 {
427 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
428 static float toFloat() { return 1.0f / 255.0f; }
429 static float fromFloat() { return 255.0f; }
430 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
431 };
432
433 //////////////////////////////////////////////////////////////////////////
434 /// TypeTraits - Format type traits specialization for UNORM8
435 //////////////////////////////////////////////////////////////////////////
436 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
437 {
438 static const SWR_TYPE MyType = SWR_TYPE_SNORM;
439 static float toFloat() { return 1.0f / 127.0f; }
440 static float fromFloat() { return 127.0f; }
441 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
442 };
443
444 //////////////////////////////////////////////////////////////////////////
445 /// TypeTraits - Format type traits specialization for UNORM16
446 //////////////////////////////////////////////////////////////////////////
447 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
448 {
449 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
450 static float toFloat() { return 1.0f / 65535.0f; }
451 static float fromFloat() { return 65535.0f; }
452 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
453 };
454
455 //////////////////////////////////////////////////////////////////////////
456 /// TypeTraits - Format type traits specialization for SNORM16
457 //////////////////////////////////////////////////////////////////////////
458 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
459 {
460 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
461 static float toFloat() { return 1.0f / 32767.0f; }
462 static float fromFloat() { return 32767.0f; }
463 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
464 };
465
466 //////////////////////////////////////////////////////////////////////////
467 /// TypeTraits - Format type traits specialization for UNORM24
468 //////////////////////////////////////////////////////////////////////////
469 template<>
470 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
471 {
472 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
473 static float toFloat() { return 1.0f / 16777215.0f; }
474 static float fromFloat() { return 16777215.0f; }
475 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
476 };
477
478 //////////////////////////////////////////////////////////////////////////
479 // FLOAT Specializations from here on...
480 //////////////////////////////////////////////////////////////////////////
// Bit-preserving reinterpretation between the float view (__m128) and the
// integer view (__m128i) of a 128-bit SSE register; no value conversion.
#define TO_M128i(a) _mm_castps_si128(a)
#define TO_M128(a)  _mm_castsi128_ps(a)
483
484 #include "math.h"
485
/// Fast approximate power function on four packed floats.
///
/// Approximates a constant multiple of arg^(expnum/expden) by treating the
/// IEEE-754 bit pattern of a float as a scaled logarithm: the bits are
/// converted to a float, multiplied by the exponent, and converted back to
/// an integer bit pattern. coeffnum/coeffden parameterize the constant
/// pre-correction factor applied to the argument first.
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static __m128 fastpow(__m128 arg)
{
    // Constant pre-correction factor, computed once on first use.
    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
                                             * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));

    const __m128 corrected = _mm_mul_ps(arg, factor);

    // Integer view of the float bits, converted to float: the "logarithm".
    const __m128 logValue = _mm_cvtepi32_ps(_mm_castps_si128(corrected));

    // Scaling the logarithm by the exponent raises the value to that power.
    const __m128 scaledLog = _mm_mul_ps(logValue, _mm_set1_ps(1.0f * expnum / expden));

    // Convert back to an integer bit pattern to "exponentiate".
    return _mm_castsi128_ps(_mm_cvtps_epi32(scaledLog));
}
509
510 inline static __m128 pow512_4(__m128 arg) {
511 // 5/12 is too small, so compute the 4th root of 20/12 instead.
512 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
513 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
514 __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
515 __m128 xover = _mm_mul_ps(arg, xf);
516
517 __m128 xfm1 = _mm_rsqrt_ps(xf);
518 __m128 x2 = _mm_mul_ps(arg, arg);
519 __m128 xunder = _mm_mul_ps(x2, xfm1);
520
521 // sqrt2 * over + 2 * sqrt2 * under
522 __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
523 _mm_add_ps(xover, xunder));
524
525 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
526 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
527 return xavg;
528 }
529
/// Raises each of the four packed floats in Base to the power Exp using the
/// scalar powf(), keeping every result in the lane its input came from.
///
/// @param Base  four packed single-precision inputs
/// @param Exp   exponent applied to every lane
/// @return      { powf(Base[0], Exp), ..., powf(Base[3], Exp) } in lane order
inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    // Spill to memory so the lanes can be read as scalars without
    // type-punning the vector through a float pointer.
    float lanes[4];
    _mm_storeu_ps(lanes, Base);

    for (int i = 0; i < 4; ++i)
    {
        lanes[i] = powf(lanes[i], Exp);
    }

    // Loading from memory preserves lane order. (_mm_set_ps takes its
    // arguments highest lane first, which is easy to get backwards.)
    return _mm_loadu_ps(lanes);
}
539
540 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
541 {
542 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
543 __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
544
545 // squeeze the mask down to 16 bits (4 bits per DWORD)
546 int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
547
548 __m128 Result;
549
550 //
551 if (CompareResult == 0xFFFF)
552 {
553 // all DWORDs are <= the threshold
554 Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
555 }
556 else if (CompareResult == 0x0)
557 {
558 // all DWORDs are > the threshold
559 __m128 fSrc_0RGB = Src;
560
561 // --> 1.055f * c(1.0f/2.4f) - 0.055f
562 #if KNOB_USE_FAST_SRGB == TRUE
563 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
564 __m128 f = pow512_4(fSrc_0RGB);
565 #else
566 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
567 #endif
568 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
569 Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
570 }
571 else
572 {
573 // some DWORDs are <= the threshold and some are > threshold
574 __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
575
576 __m128 fSrc_0RGB = Src;
577
578 // --> 1.055f * c(1.0f/2.4f) - 0.055f
579 #if KNOB_USE_FAST_SRGB == TRUE
580 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
581 __m128 f = pow512_4(fSrc_0RGB);
582 #else
583 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
584 #endif
585 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
586 f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
587
588 // Clear the alpha (is garbage after the sub)
589 __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
590
591 __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
592 __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
593 __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
594
595 Result = TO_M128(CombinedParts);
596 }
597
598 return Result;
599 }
600
601 //////////////////////////////////////////////////////////////////////////
602 /// TypeTraits - Format type traits specialization for FLOAT16
603 //////////////////////////////////////////////////////////////////////////
604 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
605 {
606 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
607 static float toFloat() { return 1.0f; }
608 static float fromFloat() { return 1.0f; }
609 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
610
611 static simdscalar pack(const simdscalar &in)
612 {
613 #if KNOB_SIMD_WIDTH == 8
614 #if (KNOB_ARCH == KNOB_ARCH_AVX)
615 // input is 8 packed float32, output is 8 packed float16
616 simdscalari src = _simd_castps_si(in);
617
618 static const uint32_t FLOAT_EXP_BITS = 8;
619 static const uint32_t FLOAT_MANTISSA_BITS = 23;
620 static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
621 static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
622
623 static const uint32_t HALF_EXP_BITS = 5;
624 static const uint32_t HALF_MANTISSA_BITS = 10;
625 static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
626
627 // minimum exponent required, exponents below this are flushed to 0.
628 static const int32_t HALF_EXP_MIN = -14;
629 static const int32_t FLOAT_EXP_BIAS = 127;
630 static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
631 static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
632
633 // maximum exponent required, exponents above this are set to infinity
634 static const int32_t HALF_EXP_MAX = 15;
635 static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
636
637 const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
638 const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK);
639 const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
640 const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
641 const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
642 const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
643
644 simdscalari vSign = _simd_and_si(src, vSignMask);
645 simdscalari vExp = _simd_and_si(src, vExpMask);
646 simdscalari vMan = _simd_and_si(src, vManMask);
647
648 simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz);
649 simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
650 simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp);
651 simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
652
653 simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
654
655 // pack output 16-bits into the lower 16-bits of each 32-bit channel
656 simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
657 vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
658
659 // Flush To Zero
660 vDst = _simd_andnot_si(vFTZMask, vDst);
661 // Apply Infinites / NaN
662 vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
663
664 // Apply clamps
665 vDst = _simd_andnot_si(vClampMask, vDst);
666 vDst = _simd_or_si(vDst,
667 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
668
669 // Compute Denormals (subnormals)
670 if (!_mm256_testz_si256(vDenormMask, vDenormMask))
671 {
672 uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
673 uint32_t *pExp = (uint32_t*)&vExp;
674 uint32_t *pMan = (uint32_t*)&vMan;
675 uint32_t *pDst = (uint32_t*)&vDst;
676 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
677 {
678 if (pDenormMask[i])
679 {
680 // Need to compute subnormal value
681 uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
682 uint32_t mantissa = pMan[i] |
683 (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit
684
685 pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
686 }
687 }
688 }
689
690 // Add in sign bits
691 vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
692
693 // Pack to lower 128-bits
694 vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
695
696 #if 0
697 #if !defined(NDEBUG)
698 simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
699
700 for (uint32_t i = 0; i < 4; ++i)
701 {
702 SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
703 }
704 #endif
705 #endif
706
707 return _simd_castsi_ps(vDst);
708
709 #else
710 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
711 #endif
712 #else
713 #error Unsupported vector width
714 #endif
715 }
716
717 static simdscalar unpack(const simdscalar &in)
718 {
719 // input is 8 packed float16, output is 8 packed float32
720 SWR_ASSERT(0); // @todo
721 return _simd_setzero_ps();
722 }
723 };
724
725 //////////////////////////////////////////////////////////////////////////
726 /// TypeTraits - Format type traits specialization for FLOAT32
727 //////////////////////////////////////////////////////////////////////////
728 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
729 {
730 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
731 static float toFloat() { return 1.0f; }
732 static float fromFloat() { return 1.0f; }
733 static inline simdscalar convertSrgb(simdscalar &in)
734 {
735 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
736 __m128 srcLo = _mm256_extractf128_ps(in, 0);
737 __m128 srcHi = _mm256_extractf128_ps(in, 1);
738
739 srcLo = ConvertFloatToSRGB2(srcLo);
740 srcHi = ConvertFloatToSRGB2(srcHi);
741
742 in = _mm256_insertf128_ps(in, srcLo, 0);
743 in = _mm256_insertf128_ps(in, srcHi, 1);
744
745 #endif
746 return in;
747 }
748 };
749
//////////////////////////////////////////////////////////////////////////
/// Format1 - Bitfield for single component formats.
///
/// Only r is a real channel. g, b and a share its storage (same union) and
/// exist so that code written against four-channel formats still compiles.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x>
struct Format1
{
    union
    {
        uint32_t r : x;     ///< the single x-bit channel

        ///@ The following are here to provide full template needed in Formats.
        uint32_t g : x;     ///< overlays r
        uint32_t b : x;     ///< overlays r
        uint32_t a : x;     ///< overlays r
    };
};
766
767 //////////////////////////////////////////////////////////////////////////
768 /// Format1 - Bitfield for single component formats - 8 bit specialization
769 //////////////////////////////////////////////////////////////////////////
770 template<>
771 struct Format1<8>
772 {
773 union
774 {
775 uint8_t r;
776
777 ///@ The following are here to provide full template needed in Formats.
778 uint8_t g;
779 uint8_t b;
780 uint8_t a;
781 };
782 };
783
784 //////////////////////////////////////////////////////////////////////////
785 /// Format1 - Bitfield for single component formats - 16 bit specialization
786 //////////////////////////////////////////////////////////////////////////
787 template<>
788 struct Format1<16>
789 {
790 union
791 {
792 uint16_t r;
793
794 ///@ The following are here to provide full template needed in Formats.
795 uint16_t g;
796 uint16_t b;
797 uint16_t a;
798 };
799 };
800
//////////////////////////////////////////////////////////////////////////
/// Format2 - Bitfield for 2 component formats.
///
/// r and g are the real channels. The second anonymous struct overlays the
/// first, so b aliases r and a aliases g; it exists so that code written
/// against four-channel formats still compiles.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y>
union Format2
{
    struct
    {
        uint32_t r : x;     ///< first channel, x bits
        uint32_t g : y;     ///< second channel, y bits
    };
    struct
    {
        ///@ The following are here to provide full template needed in Formats.
        uint32_t b : x;     ///< overlays r
        uint32_t a : y;     ///< overlays g
    };
};
819
820 //////////////////////////////////////////////////////////////////////////
821 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
822 //////////////////////////////////////////////////////////////////////////
823 template<>
824 union Format2<8,8>
825 {
826 struct
827 {
828 uint16_t r : 8;
829 uint16_t g : 8;
830 };
831 struct
832 {
833 ///@ The following are here to provide full template needed in Formats.
834 uint16_t b : 8;
835 uint16_t a : 8;
836 };
837 };
838
//////////////////////////////////////////////////////////////////////////
/// Format3 - Bitfield for 3 component formats.
///
/// r, g and b are the real channels. a is a full 32-bit member that shares
/// storage with the bitfield struct.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z>
union Format3
{
    struct
    {
        uint32_t r : x;     ///< first channel, x bits
        uint32_t g : y;     ///< second channel, y bits
        uint32_t b : z;     ///< third channel, z bits
    };

    ///@note This is here to provide full template needed in Formats.
    uint32_t a;
};
853
//////////////////////////////////////////////////////////////////////////
/// Format3 - Bitfield for 3 component formats - 16 bit specialization
///
/// Specialization for 5/6/5-bit components. The bit-fields are declared on
/// uint16_t instead of the generic template's uint32_t (5 + 6 + 5 = 16
/// bits). The a member is a plain uint16_t that overlays the r/g/b
/// bit-fields.
//////////////////////////////////////////////////////////////////////////
template<>
union Format3<5,6,5>
{
    // The three real components.
    struct
    {
        uint16_t r : 5;
        uint16_t g : 6;
        uint16_t b : 5;
    };
    uint16_t a;  ///@note This is here to provide the full set of channel members needed in Formats.
};
868
//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats.
///
/// x, y, z and w are the bit widths of the r, g, b and a components,
/// declared in that order as uint32_t bit-fields.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
struct Format4
{
    uint32_t r : x;
    uint32_t g : y;
    uint32_t b : z;
    uint32_t a : w;
};
880
//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats - 16 bit specialization
///           for 5/5/5/1-bit components.
///
/// The bit-fields are declared on uint16_t instead of the generic
/// template's uint32_t (5 + 5 + 5 + 1 = 16 bits).
//////////////////////////////////////////////////////////////////////////
template<>
struct Format4<5,5,5,1>
{
    uint16_t r : 5;
    uint16_t g : 5;
    uint16_t b : 5;
    uint16_t a : 1;
};
892
//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats - 16 bit specialization
///           for 4/4/4/4-bit components.
///
/// The bit-fields are declared on uint16_t instead of the generic
/// template's uint32_t (4 + 4 + 4 + 4 = 16 bits).
//////////////////////////////////////////////////////////////////////////
template<>
struct Format4<4,4,4,4>
{
    uint16_t r : 4;
    uint16_t g : 4;
    uint16_t b : 4;
    uint16_t a : 4;
};
904
905 //////////////////////////////////////////////////////////////////////////
906 /// ComponentTraits - Default components
907 //////////////////////////////////////////////////////////////////////////
908 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
909 struct Defaults
910 {
911 INLINE static uint32_t GetDefault(uint32_t comp)
912 {
913 static const uint32_t defaults[4]{ x, y, z, w };
914 return defaults[comp];
915 }
916 };
917
918 //////////////////////////////////////////////////////////////////////////
919 /// ComponentTraits - Component type traits.
920 //////////////////////////////////////////////////////////////////////////
921 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
922 struct ComponentTraits
923 {
924 INLINE static SWR_TYPE GetType(uint32_t comp)
925 {
926 static const SWR_TYPE CompType[4]{ X, Y, Z, W };
927 return CompType[comp];
928 }
929
930 INLINE static uint32_t GetBPC(uint32_t comp)
931 {
932 static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
933 return MyBpc[comp];
934 }
935
936 INLINE static bool isNormalized(uint32_t comp)
937 {
938 switch (comp)
939 {
940 case 0:
941 return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
942 case 1:
943 return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
944 case 2:
945 return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
946 case 3:
947 return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
948 }
949 SWR_ASSERT(0);
950 return false;
951 }
952
953 INLINE static float toFloat(uint32_t comp)
954 {
955 switch (comp)
956 {
957 case 0:
958 return TypeTraits<X, NumBitsX>::toFloat();
959 case 1:
960 return TypeTraits<Y, NumBitsY>::toFloat();
961 case 2:
962 return TypeTraits<Z, NumBitsZ>::toFloat();
963 case 3:
964 return TypeTraits<W, NumBitsW>::toFloat();
965 }
966 SWR_ASSERT(0);
967 return TypeTraits<X, NumBitsX>::toFloat();
968
969 }
970
971 INLINE static float fromFloat(uint32_t comp)
972 {
973 switch (comp)
974 {
975 case 0:
976 return TypeTraits<X, NumBitsX>::fromFloat();
977 case 1:
978 return TypeTraits<Y, NumBitsY>::fromFloat();
979 case 2:
980 return TypeTraits<Z, NumBitsZ>::fromFloat();
981 case 3:
982 return TypeTraits<W, NumBitsW>::fromFloat();
983 }
984 SWR_ASSERT(0);
985 return TypeTraits<X, NumBitsX>::fromFloat();
986 }
987
988 INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
989 {
990 switch (comp)
991 {
992 case 0:
993 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
994 case 1:
995 return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
996 case 2:
997 return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
998 case 3:
999 return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1000 }
1001 SWR_ASSERT(0);
1002 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1003 }
1004
1005 INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
1006 {
1007 switch (comp)
1008 {
1009 case 0:
1010 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1011 return;
1012 case 1:
1013 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1014 return;
1015 case 2:
1016 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1017 return;
1018 case 3:
1019 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1020 return;
1021 }
1022 SWR_ASSERT(0);
1023 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1024 }
1025
1026 INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1027 {
1028 switch (comp)
1029 {
1030 case 0:
1031 return TypeTraits<X, NumBitsX>::unpack(in);
1032 case 1:
1033 return TypeTraits<Y, NumBitsY>::unpack(in);
1034 case 2:
1035 return TypeTraits<Z, NumBitsZ>::unpack(in);
1036 case 3:
1037 return TypeTraits<W, NumBitsW>::unpack(in);
1038 }
1039 SWR_ASSERT(0);
1040 return TypeTraits<X, NumBitsX>::unpack(in);
1041 }
1042
1043 INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1044 {
1045 switch (comp)
1046 {
1047 case 0:
1048 return TypeTraits<X, NumBitsX>::pack(in);
1049 case 1:
1050 return TypeTraits<Y, NumBitsY>::pack(in);
1051 case 2:
1052 return TypeTraits<Z, NumBitsZ>::pack(in);
1053 case 3:
1054 return TypeTraits<W, NumBitsW>::pack(in);
1055 }
1056 SWR_ASSERT(0);
1057 return TypeTraits<X, NumBitsX>::pack(in);
1058 }
1059
1060 INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1061 {
1062 switch (comp)
1063 {
1064 case 0:
1065 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1066 case 1:
1067 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1068 case 2:
1069 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1070 case 3:
1071 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1072 }
1073 SWR_ASSERT(0);
1074 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1075 }
1076 };