gallium/swr: add OpenSWR rasterizer
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / format_types.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file format_types.h
24 *
25 * @brief Definitions for SWR_FORMAT functions.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 //////////////////////////////////////////////////////////////////////////
31 /// PackTraits - Helpers for packing / unpacking same pixel sizes
32 //////////////////////////////////////////////////////////////////////////
33 template <uint32_t NumBits, bool Signed = false>
34 struct PackTraits
35 {
36 static const uint32_t MyNumBits = NumBits;
37 static simdscalar loadSOA(const BYTE *pSrc) = delete;
38 static void storeSOA(BYTE *pDst, simdscalar src) = delete;
39 static simdscalar unpack(simdscalar &in) = delete;
40 static simdscalar pack(simdscalar &in) = delete;
41 };
42
43 //////////////////////////////////////////////////////////////////////////
44 /// PackTraits - Helpers for packing / unpacking unused channels
45 //////////////////////////////////////////////////////////////////////////
46 template <>
47 struct PackTraits<0, false>
48 {
49 static const uint32_t MyNumBits = 0;
50
51 static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); }
52 static void storeSOA(BYTE *pDst, simdscalar src) { return; }
53 static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
54 static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
55 };
56
57
58 //////////////////////////////////////////////////////////////////////////
59 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
60 //////////////////////////////////////////////////////////////////////////
61 template <>
62 struct PackTraits<8, false>
63 {
64 static const uint32_t MyNumBits = 8;
65
66 static simdscalar loadSOA(const BYTE *pSrc)
67 {
68 #if KNOB_SIMD_WIDTH == 8
69 __m256 result = _mm256_setzero_ps();
70 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
71 return _mm256_insertf128_ps(result, vLo, 0);
72 #else
73 #error Unsupported vector width
74 #endif
75 }
76
77 static void storeSOA(BYTE *pDst, simdscalar src)
78 {
79 // store simd bytes
80 #if KNOB_SIMD_WIDTH == 8
81 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
82 #else
83 #error Unsupported vector width
84 #endif
85 }
86
87 static simdscalar unpack(simdscalar &in)
88 {
89 #if KNOB_SIMD_WIDTH == 8
90 #if KNOB_ARCH==KNOB_ARCH_AVX
91 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
92 __m128i resLo = _mm_cvtepu8_epi32(src);
93 __m128i resHi = _mm_shuffle_epi8(src,
94 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
95
96 __m256i result = _mm256_castsi128_si256(resLo);
97 result = _mm256_insertf128_si256(result, resHi, 1);
98 return _mm256_castsi256_ps(result);
99 #elif KNOB_ARCH==KNOB_ARCH_AVX2
100 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
101 #endif
102 #else
103 #error Unsupported vector width
104 #endif
105 }
106
107 static simdscalar pack(simdscalar &in)
108 {
109 #if KNOB_SIMD_WIDTH == 8
110 simdscalari src = _simd_castps_si(in);
111 __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
112 __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
113 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
114 #else
115 #error Unsupported vector width
116 #endif
117 }
118 };
119
120 //////////////////////////////////////////////////////////////////////////
121 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
122 //////////////////////////////////////////////////////////////////////////
123 template <>
124 struct PackTraits<8, true>
125 {
126 static const uint32_t MyNumBits = 8;
127
128 static simdscalar loadSOA(const BYTE *pSrc)
129 {
130 #if KNOB_SIMD_WIDTH == 8
131 __m256 result = _mm256_setzero_ps();
132 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
133 return _mm256_insertf128_ps(result, vLo, 0);
134 #else
135 #error Unsupported vector width
136 #endif
137 }
138
139 static void storeSOA(BYTE *pDst, simdscalar src)
140 {
141 // store simd bytes
142 #if KNOB_SIMD_WIDTH == 8
143 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
144 #else
145 #error Unsupported vector width
146 #endif
147 }
148
149 static simdscalar unpack(simdscalar &in)
150 {
151 #if KNOB_SIMD_WIDTH == 8
152 #if KNOB_ARCH==KNOB_ARCH_AVX
153 SWR_ASSERT(0); // I think this may be incorrect.
154 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
155 __m128i resLo = _mm_cvtepi8_epi32(src);
156 __m128i resHi = _mm_shuffle_epi8(src,
157 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
158
159 __m256i result = _mm256_castsi128_si256(resLo);
160 result = _mm256_insertf128_si256(result, resHi, 1);
161 return _mm256_castsi256_ps(result);
162 #elif KNOB_ARCH==KNOB_ARCH_AVX2
163 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
164 #endif
165 #else
166 #error Unsupported vector width
167 #endif
168 }
169
170 static simdscalar pack(simdscalar &in)
171 {
172 #if KNOB_SIMD_WIDTH == 8
173 simdscalari src = _simd_castps_si(in);
174 __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
175 __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
176 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
177 #else
178 #error Unsupported vector width
179 #endif
180 }
181 };
182
183 //////////////////////////////////////////////////////////////////////////
184 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
185 //////////////////////////////////////////////////////////////////////////
186 template <>
187 struct PackTraits<16, false>
188 {
189 static const uint32_t MyNumBits = 16;
190
191 static simdscalar loadSOA(const BYTE *pSrc)
192 {
193 #if KNOB_SIMD_WIDTH == 8
194 __m256 result = _mm256_setzero_ps();
195 __m128 vLo = _mm_load_ps((const float*)pSrc);
196 return _mm256_insertf128_ps(result, vLo, 0);
197 #else
198 #error Unsupported vector width
199 #endif
200 }
201
202 static void storeSOA(BYTE *pDst, simdscalar src)
203 {
204 #if KNOB_SIMD_WIDTH == 8
205 // store 16B (2B * 8)
206 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
207 #else
208 #error Unsupported vector width
209 #endif
210 }
211
212 static simdscalar unpack(simdscalar &in)
213 {
214 #if KNOB_SIMD_WIDTH == 8
215 #if KNOB_ARCH==KNOB_ARCH_AVX
216 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
217 __m128i resLo = _mm_cvtepu16_epi32(src);
218 __m128i resHi = _mm_shuffle_epi8(src,
219 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
220
221 __m256i result = _mm256_castsi128_si256(resLo);
222 result = _mm256_insertf128_si256(result, resHi, 1);
223 return _mm256_castsi256_ps(result);
224 #elif KNOB_ARCH==KNOB_ARCH_AVX2
225 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
226 #endif
227 #else
228 #error Unsupported vector width
229 #endif
230 }
231
232 static simdscalar pack(simdscalar &in)
233 {
234 #if KNOB_SIMD_WIDTH == 8
235 simdscalari src = _simd_castps_si(in);
236 __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
237 return _mm256_castsi256_ps(res);
238 #else
239 #error Unsupported vector width
240 #endif
241 }
242 };
243
244 //////////////////////////////////////////////////////////////////////////
245 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
246 //////////////////////////////////////////////////////////////////////////
247 template <>
248 struct PackTraits<16, true>
249 {
250 static const uint32_t MyNumBits = 16;
251
252 static simdscalar loadSOA(const BYTE *pSrc)
253 {
254 #if KNOB_SIMD_WIDTH == 8
255 __m256 result = _mm256_setzero_ps();
256 __m128 vLo = _mm_load_ps((const float*)pSrc);
257 return _mm256_insertf128_ps(result, vLo, 0);
258 #else
259 #error Unsupported vector width
260 #endif
261 }
262
263 static void storeSOA(BYTE *pDst, simdscalar src)
264 {
265 #if KNOB_SIMD_WIDTH == 8
266 // store 16B (2B * 8)
267 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
268 #else
269 #error Unsupported vector width
270 #endif
271 }
272
273 static simdscalar unpack(simdscalar &in)
274 {
275 #if KNOB_SIMD_WIDTH == 8
276 #if KNOB_ARCH==KNOB_ARCH_AVX
277 SWR_ASSERT(0); // I think this is incorrectly implemented
278 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
279 __m128i resLo = _mm_cvtepi16_epi32(src);
280 __m128i resHi = _mm_shuffle_epi8(src,
281 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
282
283 __m256i result = _mm256_castsi128_si256(resLo);
284 result = _mm256_insertf128_si256(result, resHi, 1);
285 return _mm256_castsi256_ps(result);
286 #elif KNOB_ARCH==KNOB_ARCH_AVX2
287 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
288 #endif
289 #else
290 #error Unsupported vector width
291 #endif
292 }
293
294 static simdscalar pack(simdscalar &in)
295 {
296 #if KNOB_SIMD_WIDTH == 8
297 simdscalari src = _simd_castps_si(in);
298 __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
299 return _mm256_castsi256_ps(res);
300 #else
301 #error Unsupported vector width
302 #endif
303 }
304 };
305
306 //////////////////////////////////////////////////////////////////////////
307 /// PackTraits - Helpers for packing / unpacking 32 bit channels
308 //////////////////////////////////////////////////////////////////////////
309 template <>
310 struct PackTraits<32, false>
311 {
312 static const uint32_t MyNumBits = 32;
313
314 static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); }
315 static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
316 static simdscalar unpack(simdscalar &in) { return in; }
317 static simdscalar pack(simdscalar &in) { return in; }
318 };
319
320 //////////////////////////////////////////////////////////////////////////
321 /// TypeTraits - Format type traits.
322 //////////////////////////////////////////////////////////////////////////
323 template<SWR_TYPE type, uint32_t NumBits>
324 struct TypeTraits : PackTraits<NumBits>
325 {
326 static const SWR_TYPE MyType = type;
327 static float toFloat() { return 0.0; }
328 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
329 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
330 };
331
332 //////////////////////////////////////////////////////////////////////////
333 /// TypeTraits - Format type traits specialization for UINT8
334 //////////////////////////////////////////////////////////////////////////
335 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
336 {
337 static const SWR_TYPE MyType = SWR_TYPE_UINT;
338 static float toFloat() { return 0.0; }
339 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
340 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
341 };
342
343 //////////////////////////////////////////////////////////////////////////
344 /// TypeTraits - Format type traits specialization for UINT8
345 //////////////////////////////////////////////////////////////////////////
346 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
347 {
348 static const SWR_TYPE MyType = SWR_TYPE_SINT;
349 static float toFloat() { return 0.0; }
350 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
351 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
352 };
353
354 //////////////////////////////////////////////////////////////////////////
355 /// TypeTraits - Format type traits specialization for UINT16
356 //////////////////////////////////////////////////////////////////////////
357 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
358 {
359 static const SWR_TYPE MyType = SWR_TYPE_UINT;
360 static float toFloat() { return 0.0; }
361 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
362 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
363 };
364
365 //////////////////////////////////////////////////////////////////////////
366 /// TypeTraits - Format type traits specialization for SINT16
367 //////////////////////////////////////////////////////////////////////////
368 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
369 {
370 static const SWR_TYPE MyType = SWR_TYPE_SINT;
371 static float toFloat() { return 0.0; }
372 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
373 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
374 };
375
376 //////////////////////////////////////////////////////////////////////////
377 /// TypeTraits - Format type traits specialization for UINT32
378 //////////////////////////////////////////////////////////////////////////
379 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
380 {
381 static const SWR_TYPE MyType = SWR_TYPE_UINT;
382 static float toFloat() { return 0.0; }
383 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
384 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
385 };
386
387 //////////////////////////////////////////////////////////////////////////
388 /// TypeTraits - Format type traits specialization for UINT32
389 //////////////////////////////////////////////////////////////////////////
390 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
391 {
392 static const SWR_TYPE MyType = SWR_TYPE_SINT;
393 static float toFloat() { return 0.0; }
394 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
395 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
396 };
397
398 //////////////////////////////////////////////////////////////////////////
399 /// TypeTraits - Format type traits specialization for UNORM5
400 //////////////////////////////////////////////////////////////////////////
401 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
402 {
403 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
404 static float toFloat() { return 1.0f / 31.0f; }
405 static float fromFloat() { return 31.0f; }
406 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
407 };
408
409 //////////////////////////////////////////////////////////////////////////
410 /// TypeTraits - Format type traits specialization for UNORM6
411 //////////////////////////////////////////////////////////////////////////
412 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
413 {
414 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
415 static float toFloat() { return 1.0f / 63.0f; }
416 static float fromFloat() { return 63.0f; }
417 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
418 };
419
420 //////////////////////////////////////////////////////////////////////////
421 /// TypeTraits - Format type traits specialization for UNORM8
422 //////////////////////////////////////////////////////////////////////////
423 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
424 {
425 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
426 static float toFloat() { return 1.0f / 255.0f; }
427 static float fromFloat() { return 255.0f; }
428 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
429 };
430
431 //////////////////////////////////////////////////////////////////////////
432 /// TypeTraits - Format type traits specialization for UNORM8
433 //////////////////////////////////////////////////////////////////////////
434 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
435 {
436 static const SWR_TYPE MyType = SWR_TYPE_SNORM;
437 static float toFloat() { return 1.0f / 127.0f; }
438 static float fromFloat() { return 127.0f; }
439 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
440 };
441
442 //////////////////////////////////////////////////////////////////////////
443 /// TypeTraits - Format type traits specialization for UNORM16
444 //////////////////////////////////////////////////////////////////////////
445 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
446 {
447 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
448 static float toFloat() { return 1.0f / 65535.0f; }
449 static float fromFloat() { return 65535.0f; }
450 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
451 };
452
453 //////////////////////////////////////////////////////////////////////////
454 /// TypeTraits - Format type traits specialization for SNORM16
455 //////////////////////////////////////////////////////////////////////////
456 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
457 {
458 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
459 static float toFloat() { return 1.0f / 32767.0f; }
460 static float fromFloat() { return 32767.0f; }
461 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
462 };
463
464 //////////////////////////////////////////////////////////////////////////
465 /// TypeTraits - Format type traits specialization for UNORM24
466 //////////////////////////////////////////////////////////////////////////
467 template<>
468 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
469 {
470 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
471 static float toFloat() { return 1.0f / 16777215.0f; }
472 static float fromFloat() { return 16777215.0f; }
473 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
474 };
475
//////////////////////////////////////////////////////////////////////////
// FLOAT Specializations from here on...
//////////////////////////////////////////////////////////////////////////

// Bit-preserving reinterpretation between the 128-bit float and integer
// vector types; no value conversion takes place.
#define TO_M128i(a) _mm_castps_si128(a)
#define TO_M128(a) _mm_castsi128_ps(a)

// powf / exp2 are used by the sRGB helpers that follow. Angle brackets make
// sure the system header is found rather than a project-local "math.h".
#include <math.h>
483
484 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
485 inline static __m128 fastpow(__m128 arg) {
486 __m128 ret = arg;
487
488 static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
489 * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
490
491 // Apply a constant pre-correction factor.
492 ret = _mm_mul_ps(ret, factor);
493
494 // Reinterpret arg as integer to obtain logarithm.
495 //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
496 ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
497
498 // Multiply logarithm by power.
499 ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
500
501 // Convert back to "integer" to exponentiate.
502 //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
503 ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
504
505 return ret;
506 }
507
508 inline static __m128 pow512_4(__m128 arg) {
509 // 5/12 is too small, so compute the 4th root of 20/12 instead.
510 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
511 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
512 __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
513 __m128 xover = _mm_mul_ps(arg, xf);
514
515 __m128 xfm1 = _mm_rsqrt_ps(xf);
516 __m128 x2 = _mm_mul_ps(arg, arg);
517 __m128 xunder = _mm_mul_ps(x2, xfm1);
518
519 // sqrt2 * over + 2 * sqrt2 * under
520 __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
521 _mm_add_ps(xover, xunder));
522
523 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
524 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
525 return xavg;
526 }
527
/// Applies powf(lane, Exp) to each of the four float lanes of Base.
///
/// @param Base  four input values
/// @param Exp   exponent applied to every lane
/// @return vector whose lane i holds powf(Base[i], Exp)
inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    float lanes[4];
    _mm_storeu_ps(lanes, Base);

    for (int i = 0; i < 4; ++i)
    {
        lanes[i] = powf(lanes[i], Exp);
    }

    // Reload in memory order so lane i of the result corresponds to lane i
    // of the input. (_mm_set_ps takes its arguments highest lane first, so
    // passing f[0]..f[3] to it would reverse the lanes.)
    return _mm_loadu_ps(lanes);
}
537
538 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
539 {
540 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
541 __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
542
543 // squeeze the mask down to 16 bits (4 bits per DWORD)
544 int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
545
546 __m128 Result;
547
548 //
549 if (CompareResult == 0xFFFF)
550 {
551 // all DWORDs are <= the threshold
552 Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
553 }
554 else if (CompareResult == 0x0)
555 {
556 // all DWORDs are > the threshold
557 __m128 fSrc_0RGB = Src;
558
559 // --> 1.055f * c(1.0f/2.4f) - 0.055f
560 #if KNOB_USE_FAST_SRGB == TRUE
561 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
562 __m128 f = pow512_4(fSrc_0RGB);
563 #else
564 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
565 #endif
566 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
567 Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
568 }
569 else
570 {
571 // some DWORDs are <= the threshold and some are > threshold
572 __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
573
574 __m128 fSrc_0RGB = Src;
575
576 // --> 1.055f * c(1.0f/2.4f) - 0.055f
577 #if KNOB_USE_FAST_SRGB == TRUE
578 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
579 __m128 f = pow512_4(fSrc_0RGB);
580 #else
581 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
582 #endif
583 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
584 f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
585
586 // Clear the alpha (is garbage after the sub)
587 __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
588
589 __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
590 __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
591 __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
592
593 Result = TO_M128(CombinedParts);
594 }
595
596 return Result;
597 }
598
599 //////////////////////////////////////////////////////////////////////////
600 /// TypeTraits - Format type traits specialization for FLOAT16
601 //////////////////////////////////////////////////////////////////////////
602 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
603 {
604 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
605 static float toFloat() { return 1.0f; }
606 static float fromFloat() { return 1.0f; }
607 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
608
609 static simdscalar pack(const simdscalar &in)
610 {
611 #if KNOB_SIMD_WIDTH == 8
612 #if (KNOB_ARCH == KNOB_ARCH_AVX)
613 // input is 8 packed float32, output is 8 packed float16
614 simdscalari src = _simd_castps_si(in);
615
616 static const uint32_t FLOAT_EXP_BITS = 8;
617 static const uint32_t FLOAT_MANTISSA_BITS = 23;
618 static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
619 static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
620
621 static const uint32_t HALF_EXP_BITS = 5;
622 static const uint32_t HALF_MANTISSA_BITS = 10;
623 static const uint32_t HALF_MANTISSA_MASK = (1U << HALF_MANTISSA_BITS) - 1;
624 static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
625
626 // minimum exponent required, exponents below this are flushed to 0.
627 static const int32_t HALF_EXP_MIN = -14;
628 static const int32_t FLOAT_EXP_BIAS = 127;
629 static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
630 static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
631
632 // maximum exponent required, exponents above this are set to infinity
633 static const int32_t HALF_EXP_MAX = 15;
634 static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
635
636 const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
637 const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK);
638 const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
639 const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
640 const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
641 const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
642
643 simdscalari vSign = _simd_and_si(src, vSignMask);
644 simdscalari vExp = _simd_and_si(src, vExpMask);
645 simdscalari vMan = _simd_and_si(src, vManMask);
646
647 simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz);
648 simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
649 simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp);
650 simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
651
652 simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
653
654 // pack output 16-bits into the lower 16-bits of each 32-bit channel
655 simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
656 vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
657
658 // Flush To Zero
659 vDst = _simd_andnot_si(vFTZMask, vDst);
660 // Apply Infinites / NaN
661 vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
662
663 // Apply clamps
664 vDst = _simd_andnot_si(vClampMask, vDst);
665 vDst = _simd_or_si(vDst,
666 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
667
668 // Compute Denormals (subnormals)
669 if (!_mm256_testz_si256(vDenormMask, vDenormMask))
670 {
671 uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
672 uint32_t *pExp = (uint32_t*)&vExp;
673 uint32_t *pMan = (uint32_t*)&vMan;
674 uint32_t *pDst = (uint32_t*)&vDst;
675 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
676 {
677 if (pDenormMask[i])
678 {
679 // Need to compute subnormal value
680 uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
681 uint32_t mantissa = pMan[i] |
682 (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit
683
684 pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
685 }
686 }
687 }
688
689 // Add in sign bits
690 vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
691
692 // Pack to lower 128-bits
693 vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
694
695 #if 0
696 #if !defined(NDEBUG)
697 simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
698
699 for (uint32_t i = 0; i < 4; ++i)
700 {
701 SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
702 }
703 #endif
704 #endif
705
706 return _simd_castsi_ps(vDst);
707
708 #else
709 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
710 #endif
711 #else
712 #error Unsupported vector width
713 #endif
714 }
715
716 static simdscalar unpack(const simdscalar &in)
717 {
718 // input is 8 packed float16, output is 8 packed float32
719 SWR_ASSERT(0); // @todo
720 return _simd_setzero_ps();
721 }
722 };
723
724 //////////////////////////////////////////////////////////////////////////
725 /// TypeTraits - Format type traits specialization for FLOAT32
726 //////////////////////////////////////////////////////////////////////////
727 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
728 {
729 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
730 static float toFloat() { return 1.0f; }
731 static float fromFloat() { return 1.0f; }
732 static inline simdscalar convertSrgb(simdscalar &in)
733 {
734 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
735 __m128 srcLo = _mm256_extractf128_ps(in, 0);
736 __m128 srcHi = _mm256_extractf128_ps(in, 1);
737
738 srcLo = ConvertFloatToSRGB2(srcLo);
739 srcHi = ConvertFloatToSRGB2(srcHi);
740
741 in = _mm256_insertf128_ps(in, srcLo, 0);
742 in = _mm256_insertf128_ps(in, srcHi, 1);
743
744 #endif
745 return in;
746 }
747 };
748
//////////////////////////////////////////////////////////////////////////
/// Format1 - Bitfield for single component formats.
///
/// All four members live in one anonymous union, so g, b and a are just
/// other names for the x-bit field r.
///
/// @tparam x  width of the component in bits
//////////////////////////////////////////////////////////////////////////
template <uint32_t x>
struct Format1
{
    union
    {
        uint32_t r : x;     ///< the one real component

        ///@ The following are here to provide full template needed in Formats.
        uint32_t g : x;
        uint32_t b : x;
        uint32_t a : x;
    };
};
765
766 //////////////////////////////////////////////////////////////////////////
767 /// Format1 - Bitfield for single component formats - 8 bit specialization
768 //////////////////////////////////////////////////////////////////////////
769 template<>
770 struct Format1<8>
771 {
772 union
773 {
774 uint8_t r;
775
776 ///@ The following are here to provide full template needed in Formats.
777 uint8_t g;
778 uint8_t b;
779 uint8_t a;
780 };
781 };
782
783 //////////////////////////////////////////////////////////////////////////
784 /// Format1 - Bitfield for single component formats - 16 bit specialization
785 //////////////////////////////////////////////////////////////////////////
786 template<>
787 struct Format1<16>
788 {
789 union
790 {
791 uint16_t r;
792
793 ///@ The following are here to provide full template needed in Formats.
794 uint16_t g;
795 uint16_t b;
796 uint16_t a;
797 };
798 };
799
//////////////////////////////////////////////////////////////////////////
/// Format2 - Bitfield for 2 component formats.
///
/// The two anonymous structs overlay each other, so b shares storage with
/// r and a shares storage with g.
///
/// @tparam x  width of the first component in bits
/// @tparam y  width of the second component in bits
//////////////////////////////////////////////////////////////////////////
template <uint32_t x, uint32_t y>
union Format2
{
    // the two real components
    struct
    {
        uint32_t r : x;
        uint32_t g : y;
    };

    ///@ The following are here to provide full template needed in Formats.
    struct
    {
        uint32_t b : x;
        uint32_t a : y;
    };
};
818
819 //////////////////////////////////////////////////////////////////////////
820 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
821 //////////////////////////////////////////////////////////////////////////
822 template<>
823 union Format2<8,8>
824 {
825 struct
826 {
827 uint16_t r : 8;
828 uint16_t g : 8;
829 };
830 struct
831 {
832 ///@ The following are here to provide full template needed in Formats.
833 uint16_t b : 8;
834 uint16_t a : 8;
835 };
836 };
837
//////////////////////////////////////////////////////////////////////////
/// Format3 - Bitfield for 3 component formats.
///
/// @tparam x  width of r in bits
/// @tparam y  width of g in bits
/// @tparam z  width of b in bits
//////////////////////////////////////////////////////////////////////////
template <uint32_t x, uint32_t y, uint32_t z>
union Format3
{
    // the three real components
    struct
    {
        uint32_t r : x;
        uint32_t g : y;
        uint32_t b : z;
    };

    ///@note This is here to provide full template needed in Formats.
    /// It overlays the storage of the three components above.
    uint32_t a;
};
852
853 //////////////////////////////////////////////////////////////////////////
854 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
855 //////////////////////////////////////////////////////////////////////////
856 template<>
857 union Format3<5,6,5>
858 {
859 struct
860 {
861 uint16_t r : 5;
862 uint16_t g : 6;
863 uint16_t b : 5;
864 };
865 uint16_t a; ///@note This is here to provide full template needed in Formats.
866 };
867
//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats.
///
/// @tparam x - width in bits of r
/// @tparam y - width in bits of g
/// @tparam z - width in bits of b
/// @tparam w - width in bits of a
///
/// The four fields are declared in r, g, b, a order on uint32_t.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
struct Format4
{
    uint32_t r : x, g : y, b : z, a : w;
};
879
880 //////////////////////////////////////////////////////////////////////////
881 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
882 //////////////////////////////////////////////////////////////////////////
883 template<>
884 struct Format4<5,5,5,1>
885 {
886 uint16_t r : 5;
887 uint16_t g : 5;
888 uint16_t b : 5;
889 uint16_t a : 1;
890 };
891
892 //////////////////////////////////////////////////////////////////////////
893 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
894 //////////////////////////////////////////////////////////////////////////
895 template<>
896 struct Format4<4,4,4,4>
897 {
898 uint16_t r : 4;
899 uint16_t g : 4;
900 uint16_t b : 4;
901 uint16_t a : 4;
902 };
903
904 //////////////////////////////////////////////////////////////////////////
905 /// ComponentTraits - Default components
906 //////////////////////////////////////////////////////////////////////////
907 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
908 struct Defaults
909 {
910 INLINE static uint32_t GetDefault(uint32_t comp)
911 {
912 static const uint32_t defaults[4]{ x, y, z, w };
913 return defaults[comp];
914 }
915 };
916
917 //////////////////////////////////////////////////////////////////////////
918 /// ComponentTraits - Component type traits.
919 //////////////////////////////////////////////////////////////////////////
920 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
921 struct ComponentTraits
922 {
923 INLINE static SWR_TYPE GetType(uint32_t comp)
924 {
925 static const SWR_TYPE CompType[4]{ X, Y, Z, W };
926 return CompType[comp];
927 }
928
929 INLINE static uint32_t GetBPC(uint32_t comp)
930 {
931 static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
932 return MyBpc[comp];
933 }
934
935 INLINE static bool isNormalized(uint32_t comp)
936 {
937 switch (comp)
938 {
939 case 0:
940 return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
941 case 1:
942 return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
943 case 2:
944 return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
945 case 3:
946 return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
947 }
948 SWR_ASSERT(0);
949 return false;
950 }
951
952 INLINE static float toFloat(uint32_t comp)
953 {
954 switch (comp)
955 {
956 case 0:
957 return TypeTraits<X, NumBitsX>::toFloat();
958 case 1:
959 return TypeTraits<Y, NumBitsY>::toFloat();
960 case 2:
961 return TypeTraits<Z, NumBitsZ>::toFloat();
962 case 3:
963 return TypeTraits<W, NumBitsW>::toFloat();
964 }
965 SWR_ASSERT(0);
966 return TypeTraits<X, NumBitsX>::toFloat();
967
968 }
969
970 INLINE static float fromFloat(uint32_t comp)
971 {
972 switch (comp)
973 {
974 case 0:
975 return TypeTraits<X, NumBitsX>::fromFloat();
976 case 1:
977 return TypeTraits<Y, NumBitsY>::fromFloat();
978 case 2:
979 return TypeTraits<Z, NumBitsZ>::fromFloat();
980 case 3:
981 return TypeTraits<W, NumBitsW>::fromFloat();
982 }
983 SWR_ASSERT(0);
984 return TypeTraits<X, NumBitsX>::fromFloat();
985 }
986
987 INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
988 {
989 switch (comp)
990 {
991 case 0:
992 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
993 case 1:
994 return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
995 case 2:
996 return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
997 case 3:
998 return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
999 }
1000 SWR_ASSERT(0);
1001 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1002 }
1003
1004 INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
1005 {
1006 switch (comp)
1007 {
1008 case 0:
1009 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1010 return;
1011 case 1:
1012 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1013 return;
1014 case 2:
1015 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1016 return;
1017 case 3:
1018 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1019 return;
1020 }
1021 SWR_ASSERT(0);
1022 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1023 }
1024
1025 INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1026 {
1027 switch (comp)
1028 {
1029 case 0:
1030 return TypeTraits<X, NumBitsX>::unpack(in);
1031 case 1:
1032 return TypeTraits<Y, NumBitsY>::unpack(in);
1033 case 2:
1034 return TypeTraits<Z, NumBitsZ>::unpack(in);
1035 case 3:
1036 return TypeTraits<W, NumBitsW>::unpack(in);
1037 }
1038 SWR_ASSERT(0);
1039 return TypeTraits<X, NumBitsX>::unpack(in);
1040 }
1041
1042 INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1043 {
1044 switch (comp)
1045 {
1046 case 0:
1047 return TypeTraits<X, NumBitsX>::pack(in);
1048 case 1:
1049 return TypeTraits<Y, NumBitsY>::pack(in);
1050 case 2:
1051 return TypeTraits<Z, NumBitsZ>::pack(in);
1052 case 3:
1053 return TypeTraits<W, NumBitsW>::pack(in);
1054 }
1055 SWR_ASSERT(0);
1056 return TypeTraits<X, NumBitsX>::pack(in);
1057 }
1058
1059 INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1060 {
1061 switch (comp)
1062 {
1063 case 0:
1064 return TypeTraits<X, NumBitsX>::convertSrgb(in);;
1065 case 1:
1066 return TypeTraits<Y, NumBitsY>::convertSrgb(in);;
1067 case 2:
1068 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);;
1069 case 3:
1070 return TypeTraits<W, NumBitsW>::convertSrgb(in);;
1071 }
1072 SWR_ASSERT(0);
1073 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1074 }
1075 };