swr: [rasterizer core] Remove deprecated simd intrinsics
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / format_types.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file format_types.h
24 *
25 * @brief Definitions for SWR_FORMAT functions.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "utils.h"
31
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
34 //////////////////////////////////////////////////////////////////////////
// Primary template. Every accessor below is declared as deleted, so calling
// one for a NumBits/Signed combination that has no explicit specialization
// is a compile-time error. Only MyNumBits is available here.
template <uint32_t NumBits, bool Signed = false>
struct PackTraits
{
    // Channel width in bits, as given by the template argument.
    static const uint32_t MyNumBits = NumBits;
    static simdscalar loadSOA(const uint8_t *pSrc) = delete;
    static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
    static simdscalar unpack(simdscalar &in) = delete;
    static simdscalar pack(simdscalar &in) = delete;
#if ENABLE_AVX512_SIMD16
    // simd16 counterparts of the accessors above.
    static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
    static void storeSOA(uint8_t *pDst, simd16scalar src) = delete;
    static simd16scalar unpack(simd16scalar &in) = delete;
    static simd16scalar pack(simd16scalar &in) = delete;
#endif
};
50
51 //////////////////////////////////////////////////////////////////////////
52 /// PackTraits - Helpers for packing / unpacking unused channels
53 //////////////////////////////////////////////////////////////////////////
54 template <>
55 struct PackTraits<0, false>
56 {
57 static const uint32_t MyNumBits = 0;
58
59 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
60 static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
61 static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
62 static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
63 #if ENABLE_AVX512_SIMD16
64 static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
65 static void storeSOA(uint8_t *pDst, simd16scalar src) { return; }
66 static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
67 static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
68 #endif
69 };
70
71 //////////////////////////////////////////////////////////////////////////
72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
73 //////////////////////////////////////////////////////////////////////////
74 template <>
75 struct PackTraits<8, false>
76 {
77 static const uint32_t MyNumBits = 8;
78
79 static simdscalar loadSOA(const uint8_t *pSrc)
80 {
81 #if KNOB_SIMD_WIDTH == 8
82 __m256 result = _mm256_setzero_ps();
83 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
84 return _mm256_insertf128_ps(result, vLo, 0);
85 #else
86 #error Unsupported vector width
87 #endif
88 }
89
90 static void storeSOA(uint8_t *pDst, simdscalar src)
91 {
92 // store simd bytes
93 #if KNOB_SIMD_WIDTH == 8
94 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
95 #else
96 #error Unsupported vector width
97 #endif
98 }
99
100 static simdscalar unpack(simdscalar &in)
101 {
102 #if KNOB_SIMD_WIDTH == 8
103 #if KNOB_ARCH==KNOB_ARCH_AVX
104 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
105 __m128i resLo = _mm_cvtepu8_epi32(src);
106 __m128i resHi = _mm_shuffle_epi8(src,
107 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
108
109 __m256i result = _mm256_castsi128_si256(resLo);
110 result = _mm256_insertf128_si256(result, resHi, 1);
111 return _mm256_castsi256_ps(result);
112 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
113 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
114 #endif
115 #else
116 #error Unsupported vector width
117 #endif
118 }
119
120 static simdscalar pack(simdscalar &in)
121 {
122 #if KNOB_SIMD_WIDTH == 8
123 simdscalari src = _simd_castps_si(in);
124 __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
125 __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
126 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
127 #else
128 #error Unsupported vector width
129 #endif
130 }
131 #if ENABLE_AVX512_SIMD16
132
133 static simd16scalar loadSOA_16(const uint8_t *pSrc)
134 {
135 simd16scalar result = _simd16_setzero_ps();
136 simdscalar resultlo = _simd_setzero_ps();
137
138 const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
139
140 resultlo = _mm256_insertf128_ps(resultlo, src, 0);
141 result = _simd16_insert_ps(result, resultlo, 0);
142
143 return result;
144 }
145
146 static void storeSOA(uint8_t *pDst, simd16scalar src)
147 {
148 // store simd16 bytes
149 _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
150 }
151
152 static simd16scalar unpack(simd16scalar &in)
153 {
154 simd16scalari result = _simd16_setzero_si();
155
156 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
157
158 result = _simd16_insert_si(result, _simd_cvtepu8_epi32(src), 0);
159 result = _simd16_insert_si(result, _simd_cvtepu8_epi32(_mm_srli_si128(src, 8)), 1);
160
161 return _simd16_castsi_ps(result);
162 }
163
164 static simd16scalar pack(simd16scalar &in)
165 {
166 simd16scalari result = _simd16_setzero_si();
167 simdscalari resultlo = _simd_setzero_si();
168
169 __m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
170 __m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
171
172 __m128i temp = _mm_packus_epi16(templo, temphi);
173
174 resultlo = _mm256_inserti128_si256(resultlo, temp, 0);
175 result = _simd16_insert_si(result, resultlo, 0);
176
177 return _simd16_castsi_ps(result);
178 }
179 #endif
180 };
181
182 //////////////////////////////////////////////////////////////////////////
183 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
184 //////////////////////////////////////////////////////////////////////////
185 template <>
186 struct PackTraits<8, true>
187 {
188 static const uint32_t MyNumBits = 8;
189
190 static simdscalar loadSOA(const uint8_t *pSrc)
191 {
192 #if KNOB_SIMD_WIDTH == 8
193 __m256 result = _mm256_setzero_ps();
194 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
195 return _mm256_insertf128_ps(result, vLo, 0);
196 #else
197 #error Unsupported vector width
198 #endif
199 }
200
201 static void storeSOA(uint8_t *pDst, simdscalar src)
202 {
203 // store simd bytes
204 #if KNOB_SIMD_WIDTH == 8
205 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
206 #else
207 #error Unsupported vector width
208 #endif
209 }
210
211 static simdscalar unpack(simdscalar &in)
212 {
213 #if KNOB_SIMD_WIDTH == 8
214 #if KNOB_ARCH==KNOB_ARCH_AVX
215 SWR_ASSERT(0); // I think this may be incorrect.
216 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
217 __m128i resLo = _mm_cvtepi8_epi32(src);
218 __m128i resHi = _mm_shuffle_epi8(src,
219 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
220
221 __m256i result = _mm256_castsi128_si256(resLo);
222 result = _mm256_insertf128_si256(result, resHi, 1);
223 return _mm256_castsi256_ps(result);
224 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
225 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
226 #endif
227 #else
228 #error Unsupported vector width
229 #endif
230 }
231
232 static simdscalar pack(simdscalar &in)
233 {
234 #if KNOB_SIMD_WIDTH == 8
235 simdscalari src = _simd_castps_si(in);
236 __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
237 __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
238 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
239 #else
240 #error Unsupported vector width
241 #endif
242 }
243 #if ENABLE_AVX512_SIMD16
244
245 static simd16scalar loadSOA_16(const uint8_t *pSrc)
246 {
247 simd16scalar result = _simd16_setzero_ps();
248 simdscalar resultlo = _simd_setzero_ps();
249
250 const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
251
252 resultlo = _mm256_insertf128_ps(resultlo, src, 0);
253 result = _simd16_insert_ps(result, resultlo, 0);
254
255 return result;
256 }
257
258 static void storeSOA(uint8_t *pDst, simd16scalar src)
259 {
260 // store simd16 bytes
261 _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
262 }
263
264 static simd16scalar unpack(simd16scalar &in)
265 {
266 simd16scalari result = _simd16_setzero_si();
267
268 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
269
270 result = _simd16_insert_si(result, _simd_cvtepu8_epi32(src), 0);
271 result = _simd16_insert_si(result, _simd_cvtepu8_epi32(_mm_srli_si128(src, 8)), 1);
272
273 return _simd16_castsi_ps(result);
274 }
275
276 static simd16scalar pack(simd16scalar &in)
277 {
278 simd16scalari result = _simd16_setzero_si();
279 simdscalari resultlo = _simd_setzero_si();
280
281 __m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
282 __m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
283
284 __m128i temp = _mm_packs_epi16(templo, temphi);
285
286 resultlo = _mm256_inserti128_si256(resultlo, temp, 0);
287 result = _simd16_insert_si(result, resultlo, 0);
288
289 return _simd16_castsi_ps(result);
290 }
291 #endif
292 };
293
294 //////////////////////////////////////////////////////////////////////////
295 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
296 //////////////////////////////////////////////////////////////////////////
297 template <>
298 struct PackTraits<16, false>
299 {
300 static const uint32_t MyNumBits = 16;
301
302 static simdscalar loadSOA(const uint8_t *pSrc)
303 {
304 #if KNOB_SIMD_WIDTH == 8
305 __m256 result = _mm256_setzero_ps();
306 __m128 vLo = _mm_load_ps((const float*)pSrc);
307 return _mm256_insertf128_ps(result, vLo, 0);
308 #else
309 #error Unsupported vector width
310 #endif
311 }
312
313 static void storeSOA(uint8_t *pDst, simdscalar src)
314 {
315 #if KNOB_SIMD_WIDTH == 8
316 // store 16B (2B * 8)
317 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
318 #else
319 #error Unsupported vector width
320 #endif
321 }
322
323 static simdscalar unpack(simdscalar &in)
324 {
325 #if KNOB_SIMD_WIDTH == 8
326 #if KNOB_ARCH==KNOB_ARCH_AVX
327 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
328 __m128i resLo = _mm_cvtepu16_epi32(src);
329 __m128i resHi = _mm_shuffle_epi8(src,
330 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
331
332 __m256i result = _mm256_castsi128_si256(resLo);
333 result = _mm256_insertf128_si256(result, resHi, 1);
334 return _mm256_castsi256_ps(result);
335 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
336 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
337 #endif
338 #else
339 #error Unsupported vector width
340 #endif
341 }
342
343 static simdscalar pack(simdscalar &in)
344 {
345 #if KNOB_SIMD_WIDTH == 8
346 simdscalari src = _simd_castps_si(in);
347 __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
348 return _mm256_castsi256_ps(res);
349 #else
350 #error Unsupported vector width
351 #endif
352 }
353 #if ENABLE_AVX512_SIMD16
354
355 static simd16scalar loadSOA_16(const uint8_t *pSrc)
356 {
357 simd16scalar result = _simd16_setzero_ps();
358
359 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
360
361 result = _simd16_insert_ps(result, resultlo, 0);
362
363 return result;
364 }
365
366 static void storeSOA(uint8_t *pDst, simd16scalar src)
367 {
368 _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
369 }
370
371 static simd16scalar unpack(simd16scalar &in)
372 {
373 simd16scalari result = _simd16_setzero_si();
374
375 result = _simd16_insert_si(result, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 0)), 0);
376 result = _simd16_insert_si(result, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1)), 1);
377
378 return _simd16_castsi_ps(result);
379 }
380
381 static simd16scalar pack(simd16scalar &in)
382 {
383 simd16scalari result = _simd16_setzero_si();
384
385 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));
386 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));
387
388 simdscalari templo = _simd_permute2f128_si(inlo, inhi, 0x20);
389 simdscalari temphi = _simd_permute2f128_si(inlo, inhi, 0x31);
390
391 result = _simd16_insert_si(result, _simd_packus_epi32(templo, temphi), 0);
392
393 return _simd16_castsi_ps(result);
394 }
395 #endif
396 };
397
398 //////////////////////////////////////////////////////////////////////////
399 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
400 //////////////////////////////////////////////////////////////////////////
401 template <>
402 struct PackTraits<16, true>
403 {
404 static const uint32_t MyNumBits = 16;
405
406 static simdscalar loadSOA(const uint8_t *pSrc)
407 {
408 #if KNOB_SIMD_WIDTH == 8
409 __m256 result = _mm256_setzero_ps();
410 __m128 vLo = _mm_load_ps((const float*)pSrc);
411 return _mm256_insertf128_ps(result, vLo, 0);
412 #else
413 #error Unsupported vector width
414 #endif
415 }
416
417 static void storeSOA(uint8_t *pDst, simdscalar src)
418 {
419 #if KNOB_SIMD_WIDTH == 8
420 // store 16B (2B * 8)
421 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
422 #else
423 #error Unsupported vector width
424 #endif
425 }
426
427 static simdscalar unpack(simdscalar &in)
428 {
429 #if KNOB_SIMD_WIDTH == 8
430 #if KNOB_ARCH==KNOB_ARCH_AVX
431 SWR_ASSERT(0); // I think this is incorrectly implemented
432 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
433 __m128i resLo = _mm_cvtepi16_epi32(src);
434 __m128i resHi = _mm_shuffle_epi8(src,
435 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
436
437 __m256i result = _mm256_castsi128_si256(resLo);
438 result = _mm256_insertf128_si256(result, resHi, 1);
439 return _mm256_castsi256_ps(result);
440 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
441 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
442 #endif
443 #else
444 #error Unsupported vector width
445 #endif
446 }
447
448 static simdscalar pack(simdscalar &in)
449 {
450 #if KNOB_SIMD_WIDTH == 8
451 simdscalari src = _simd_castps_si(in);
452 __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
453 return _mm256_castsi256_ps(res);
454 #else
455 #error Unsupported vector width
456 #endif
457 }
458 #if ENABLE_AVX512_SIMD16
459
460 static simd16scalar loadSOA_16(const uint8_t *pSrc)
461 {
462 simd16scalar result = _simd16_setzero_ps();
463
464 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
465
466 result = _simd16_insert_ps(result, resultlo, 0);
467
468 return result;
469 }
470
471 static void storeSOA(uint8_t *pDst, simd16scalar src)
472 {
473 _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
474 }
475
476 static simd16scalar unpack(simd16scalar &in)
477 {
478 simd16scalari result = _simd16_setzero_si();
479
480 result = _simd16_insert_si(result, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 0)), 0);
481 result = _simd16_insert_si(result, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1)), 1);
482
483 return _simd16_castsi_ps(result);
484 }
485
486 static simd16scalar pack(simd16scalar &in)
487 {
488 simd16scalari result = _simd16_setzero_si();
489
490 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));
491 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));
492
493 simdscalari templo = _simd_permute2f128_si(inlo, inhi, 0x20);
494 simdscalari temphi = _simd_permute2f128_si(inlo, inhi, 0x31);
495
496 result = _simd16_insert_si(result, _simd_packus_epi32(templo, temphi), 0);
497
498 return _simd16_castsi_ps(result);
499 }
500 #endif
501 };
502
503 //////////////////////////////////////////////////////////////////////////
504 /// PackTraits - Helpers for packing / unpacking 32 bit channels
505 //////////////////////////////////////////////////////////////////////////
506 template <>
507 struct PackTraits<32, false>
508 {
509 static const uint32_t MyNumBits = 32;
510
511 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
512 static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
513 static simdscalar unpack(simdscalar &in) { return in; }
514 static simdscalar pack(simdscalar &in) { return in; }
515 #if ENABLE_AVX512_SIMD16
516
517 static simd16scalar loadSOA_16(const uint8_t *pSrc)
518 {
519 return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
520 }
521
522 static void storeSOA(uint8_t *pDst, simd16scalar src)
523 {
524 _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
525 }
526
527 static simd16scalar unpack(simd16scalar &in)
528 {
529 return in;
530 }
531
532 static simd16scalar pack(simd16scalar &in)
533 {
534 return in;
535 }
536 #endif
537 };
538
539 //////////////////////////////////////////////////////////////////////////
540 /// TypeTraits - Format type traits.
541 //////////////////////////////////////////////////////////////////////////
542 template<SWR_TYPE type, uint32_t NumBits>
543 struct TypeTraits : PackTraits<NumBits>
544 {
545 static const SWR_TYPE MyType = type;
546 static float toFloat() { return 0.0; }
547 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
548 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
549 };
550
551 //////////////////////////////////////////////////////////////////////////
552 /// TypeTraits - Format type traits specialization for UINT8
553 //////////////////////////////////////////////////////////////////////////
554 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
555 {
556 static const SWR_TYPE MyType = SWR_TYPE_UINT;
557 static float toFloat() { return 0.0; }
558 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
559 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
560 };
561
562 //////////////////////////////////////////////////////////////////////////
563 /// TypeTraits - Format type traits specialization for SINT8
564 //////////////////////////////////////////////////////////////////////////
565 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
566 {
567 static const SWR_TYPE MyType = SWR_TYPE_SINT;
568 static float toFloat() { return 0.0; }
569 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
570 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
571 };
572
573 //////////////////////////////////////////////////////////////////////////
574 /// TypeTraits - Format type traits specialization for UINT16
575 //////////////////////////////////////////////////////////////////////////
576 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
577 {
578 static const SWR_TYPE MyType = SWR_TYPE_UINT;
579 static float toFloat() { return 0.0; }
580 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
581 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
582 };
583
584 //////////////////////////////////////////////////////////////////////////
585 /// TypeTraits - Format type traits specialization for SINT16
586 //////////////////////////////////////////////////////////////////////////
587 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
588 {
589 static const SWR_TYPE MyType = SWR_TYPE_SINT;
590 static float toFloat() { return 0.0; }
591 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
592 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
593 };
594
595 //////////////////////////////////////////////////////////////////////////
596 /// TypeTraits - Format type traits specialization for UINT32
597 //////////////////////////////////////////////////////////////////////////
598 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
599 {
600 static const SWR_TYPE MyType = SWR_TYPE_UINT;
601 static float toFloat() { return 0.0; }
602 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
603 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
604 };
605
606 //////////////////////////////////////////////////////////////////////////
607 /// TypeTraits - Format type traits specialization for SINT32
608 //////////////////////////////////////////////////////////////////////////
609 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
610 {
611 static const SWR_TYPE MyType = SWR_TYPE_SINT;
612 static float toFloat() { return 0.0; }
613 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
614 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
615 };
616
617 //////////////////////////////////////////////////////////////////////////
618 /// TypeTraits - Format type traits specialization for UNORM5
619 //////////////////////////////////////////////////////////////////////////
620 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
621 {
622 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
623 static float toFloat() { return 1.0f / 31.0f; }
624 static float fromFloat() { return 31.0f; }
625 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
626 };
627
628 //////////////////////////////////////////////////////////////////////////
629 /// TypeTraits - Format type traits specialization for UNORM6
630 //////////////////////////////////////////////////////////////////////////
631 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
632 {
633 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
634 static float toFloat() { return 1.0f / 63.0f; }
635 static float fromFloat() { return 63.0f; }
636 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
637 };
638
639 //////////////////////////////////////////////////////////////////////////
640 /// TypeTraits - Format type traits specialization for UNORM8
641 //////////////////////////////////////////////////////////////////////////
642 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
643 {
644 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
645 static float toFloat() { return 1.0f / 255.0f; }
646 static float fromFloat() { return 255.0f; }
647 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
648 };
649
650 //////////////////////////////////////////////////////////////////////////
651 /// TypeTraits - Format type traits specialization for SNORM8
652 //////////////////////////////////////////////////////////////////////////
653 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
654 {
655 static const SWR_TYPE MyType = SWR_TYPE_SNORM;
656 static float toFloat() { return 1.0f / 127.0f; }
657 static float fromFloat() { return 127.0f; }
658 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
659 };
660
661 //////////////////////////////////////////////////////////////////////////
662 /// TypeTraits - Format type traits specialization for UNORM16
663 //////////////////////////////////////////////////////////////////////////
664 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
665 {
666 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
667 static float toFloat() { return 1.0f / 65535.0f; }
668 static float fromFloat() { return 65535.0f; }
669 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
670 };
671
672 //////////////////////////////////////////////////////////////////////////
673 /// TypeTraits - Format type traits specialization for SNORM16
674 //////////////////////////////////////////////////////////////////////////
675 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
676 {
677 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
678 static float toFloat() { return 1.0f / 32767.0f; }
679 static float fromFloat() { return 32767.0f; }
680 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
681 };
682
683 //////////////////////////////////////////////////////////////////////////
684 /// TypeTraits - Format type traits specialization for UNORM24
685 //////////////////////////////////////////////////////////////////////////
686 template<>
687 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
688 {
689 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
690 static float toFloat() { return 1.0f / 16777215.0f; }
691 static float fromFloat() { return 16777215.0f; }
692 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
693 };
694
695 //////////////////////////////////////////////////////////////////////////
696 // FLOAT Specializations from here on...
697 //////////////////////////////////////////////////////////////////////////
// Shorthand for reinterpreting a 128-bit register between its float (__m128)
// and integer (__m128i) views; these expand to the cast intrinsics only.
#define TO_M128i(a) _mm_castps_si128(a)
#define TO_M128(a) _mm_castsi128_ps(a)
700
701 #include "math.h"
702
// Approximates arg^(expnum/expden) per lane by treating the float bit
// pattern as a scaled logarithm: pre-scale, convert the bits to a float,
// multiply by the exponent, then convert back to bits.
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static __m128 fastpow(__m128 arg) {
    // Constant pre-correction factor, built once per instantiation.
    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
                                             * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));

    const __m128 corrected = _mm_mul_ps(arg, factor);

    // Integer view of the float bits stands in for the logarithm.
    const __m128 logish = _mm_cvtepi32_ps(_mm_castps_si128(corrected));

    // Multiply the logarithm by the power.
    const __m128 scaled = _mm_mul_ps(logish, _mm_set1_ps(1.0f * expnum / expden));

    // Back to an "integer" bit pattern to exponentiate.
    return _mm_castsi128_ps(_mm_cvtps_epi32(scaled));
}
726
727 inline static __m128 pow512_4(__m128 arg) {
728 // 5/12 is too small, so compute the 4th root of 20/12 instead.
729 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
730 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
731 __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
732 __m128 xover = _mm_mul_ps(arg, xf);
733
734 __m128 xfm1 = _mm_rsqrt_ps(xf);
735 __m128 x2 = _mm_mul_ps(arg, arg);
736 __m128 xunder = _mm_mul_ps(x2, xfm1);
737
738 // sqrt2 * over + 2 * sqrt2 * under
739 __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
740 _mm_add_ps(xover, xunder));
741
742 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
743 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
744 return xavg;
745 }
746
// Applies powf(lane, Exp) to each of the four float lanes of Base.
inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    float lanes[4];
    _mm_storeu_ps(lanes, Base);

    for (int i = 0; i < 4; ++i)
    {
        lanes[i] = powf(lanes[i], Exp);
    }

    return _mm_loadu_ps(lanes);
}
756
757 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
758 {
759 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
760 __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
761
762 // squeeze the mask down to 16 bits (4 bits per DWORD)
763 int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
764
765 __m128 Result;
766
767 //
768 if (CompareResult == 0xFFFF)
769 {
770 // all DWORDs are <= the threshold
771 Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
772 }
773 else if (CompareResult == 0x0)
774 {
775 // all DWORDs are > the threshold
776 __m128 fSrc_0RGB = Src;
777
778 // --> 1.055f * c(1.0f/2.4f) - 0.055f
779 #if KNOB_USE_FAST_SRGB == TRUE
780 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
781 __m128 f = pow512_4(fSrc_0RGB);
782 #else
783 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
784 #endif
785 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
786 Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
787 }
788 else
789 {
790 // some DWORDs are <= the threshold and some are > threshold
791 __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
792
793 __m128 fSrc_0RGB = Src;
794
795 // --> 1.055f * c(1.0f/2.4f) - 0.055f
796 #if KNOB_USE_FAST_SRGB == TRUE
797 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
798 __m128 f = pow512_4(fSrc_0RGB);
799 #else
800 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
801 #endif
802 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
803 f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
804
805 // Clear the alpha (is garbage after the sub)
806 __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
807
808 __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
809 __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
810 __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
811
812 Result = TO_M128(CombinedParts);
813 }
814
815 return Result;
816 }
817
818 #if ENABLE_AVX512_SIMD16
819 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
820 inline static simd16scalar fastpow(simd16scalar value)
821 {
822 static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
823 * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
824
825 // Apply a constant pre-correction factor.
826 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
827
828 // Reinterpret arg as integer to obtain logarithm.
829 //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
830 result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
831
832 // Multiply logarithm by power.
833 result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
834
835 // Convert back to "integer" to exponentiate.
836 //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
837 result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
838
839 return result;
840 }
841
842 inline static simd16scalar pow512_4(simd16scalar arg)
843 {
844 // 5/12 is too small, so compute the 4th root of 20/12 instead.
845 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
846 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
847 simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
848 simd16scalar xover = _simd16_mul_ps(arg, xf);
849
850 simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
851 simd16scalar x2 = _simd16_mul_ps(arg, arg);
852 simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
853
854 // sqrt2 * over + 2 * sqrt2 * under
855 simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));
856
857 xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
858 xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
859
860 return xavg;
861 }
862
863 inline static simd16scalar powf_wrapper(const simd16scalar base, float exp)
864 {
865 const float *f = reinterpret_cast<const float *>(&base);
866
867 return _simd16_set_ps(
868 powf(f[15], exp),
869 powf(f[14], exp),
870 powf(f[13], exp),
871 powf(f[12], exp),
872 powf(f[11], exp),
873 powf(f[10], exp),
874 powf(f[ 9], exp),
875 powf(f[ 8], exp),
876 powf(f[ 7], exp),
877 powf(f[ 6], exp),
878 powf(f[ 5], exp),
879 powf(f[ 4], exp),
880 powf(f[ 3], exp),
881 powf(f[ 2], exp),
882 powf(f[ 1], exp),
883 powf(f[ 0], exp)
884 );
885 }
886
887 // float to SRGB conversion formula
888 //
889 // if (value < 0.0031308f)
890 // value *= 12.92f;
891 // else
892 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
893 //
894 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
895 {
896 // create a mask where the source is < the minimal SRGB float value
897 const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
898
899 // if all elements are < the threshold, result = value * 12.92
900 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
901
902 if (_simd16_mask2int(mask) != 0xFFFF)
903 {
904 // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
905 #if KNOB_USE_FAST_SRGB == TRUE
906 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
907 simd16scalar result2 = pow512_4(value);
908 #else
909 simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
910 #endif
911
912 result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
913 result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
914
915 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
916 // only native AVX512 can directly use the computed mask for the blend operation
917 result = _mm512_mask_blend_ps(mask, result2, result);
918 #else
919 result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
920 #endif
921 }
922
923 return result;
924 }
925
926 #endif
927 //////////////////////////////////////////////////////////////////////////
928 /// TypeTraits - Format type traits specialization for FLOAT16
929 //////////////////////////////////////////////////////////////////////////
930 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
931 {
932 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
933 static float toFloat() { return 1.0f; }
934 static float fromFloat() { return 1.0f; }
935 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
936
937 static simdscalar pack(const simdscalar &in)
938 {
939 #if KNOB_SIMD_WIDTH == 8
940 #if (KNOB_ARCH == KNOB_ARCH_AVX)
941 // input is 8 packed float32, output is 8 packed float16
942 simdscalari src = _simd_castps_si(in);
943
944 static const uint32_t FLOAT_EXP_BITS = 8;
945 static const uint32_t FLOAT_MANTISSA_BITS = 23;
946 static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
947 static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
948
949 static const uint32_t HALF_EXP_BITS = 5;
950 static const uint32_t HALF_MANTISSA_BITS = 10;
951 static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
952
953 // minimum exponent required, exponents below this are flushed to 0.
954 static const int32_t HALF_EXP_MIN = -14;
955 static const int32_t FLOAT_EXP_BIAS = 127;
956 static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
957 static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
958
959 // maximum exponent required, exponents above this are set to infinity
960 static const int32_t HALF_EXP_MAX = 15;
961 static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
962
963 const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
964 const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK);
965 const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
966 const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
967 const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
968 const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
969
970 simdscalari vSign = _simd_and_si(src, vSignMask);
971 simdscalari vExp = _simd_and_si(src, vExpMask);
972 simdscalari vMan = _simd_and_si(src, vManMask);
973
974 simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz);
975 simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
976 simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp);
977 simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
978
979 simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
980
981 // pack output 16-bits into the lower 16-bits of each 32-bit channel
982 simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
983 vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
984
985 // Flush To Zero
986 vDst = _simd_andnot_si(vFTZMask, vDst);
987 // Apply Infinites / NaN
988 vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
989
990 // Apply clamps
991 vDst = _simd_andnot_si(vClampMask, vDst);
992 vDst = _simd_or_si(vDst,
993 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
994
995 // Compute Denormals (subnormals)
996 if (!_mm256_testz_si256(vDenormMask, vDenormMask))
997 {
998 uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
999 uint32_t *pExp = (uint32_t*)&vExp;
1000 uint32_t *pMan = (uint32_t*)&vMan;
1001 uint32_t *pDst = (uint32_t*)&vDst;
1002 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1003 {
1004 if (pDenormMask[i])
1005 {
1006 // Need to compute subnormal value
1007 uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
1008 uint32_t mantissa = pMan[i] |
1009 (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit
1010
1011 pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1012 }
1013 }
1014 }
1015
1016 // Add in sign bits
1017 vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
1018
1019 // Pack to lower 128-bits
1020 vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
1021
1022 #if 0
1023 #if !defined(NDEBUG)
1024 simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
1025
1026 for (uint32_t i = 0; i < 4; ++i)
1027 {
1028 SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
1029 }
1030 #endif
1031 #endif
1032
1033 return _simd_castsi_ps(vDst);
1034
1035 #else
1036 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
1037 #endif
1038 #else
1039 #error Unsupported vector width
1040 #endif
1041 }
1042
1043 static simdscalar unpack(const simdscalar &in)
1044 {
1045 // input is 8 packed float16, output is 8 packed float32
1046 SWR_ASSERT(0); // @todo
1047 return _simd_setzero_ps();
1048 }
1049 #if ENABLE_AVX512_SIMD16
1050
1051 static simd16scalar pack(const simd16scalar &in)
1052 {
1053 simd16scalari result = _simd16_setzero_si();
1054 simdscalari resultlo = _simd_setzero_si();
1055
1056 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1057 simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
1058 simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
1059
1060 __m128i templo = _mm256_extractf128_si256(_simd_castps_si(simdlo), 0);
1061 __m128i temphi = _mm256_extractf128_si256(_simd_castps_si(simdhi), 0);
1062
1063 #else
1064 __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
1065 __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
1066
1067 #endif
1068 resultlo = _mm256_insertf128_si256(resultlo, templo, 0);
1069 resultlo = _mm256_insertf128_si256(resultlo, temphi, 1);
1070
1071 result = _simd16_insert_si(result, resultlo, 0);
1072
1073 return _simd16_castsi_ps(result);
1074 }
1075
1076 static simd16scalar unpack(const simd16scalar &in)
1077 {
1078 // input is 16 packed float16, output is 16 packed float32
1079 SWR_ASSERT(0); // @todo
1080 return _simd16_setzero_ps();
1081 }
1082 #endif
1083 };
1084
1085 //////////////////////////////////////////////////////////////////////////
1086 /// TypeTraits - Format type traits specialization for FLOAT32
1087 //////////////////////////////////////////////////////////////////////////
1088 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
1089 {
1090 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
1091 static float toFloat() { return 1.0f; }
1092 static float fromFloat() { return 1.0f; }
1093 static inline simdscalar convertSrgb(simdscalar &in)
1094 {
1095 #if KNOB_SIMD_WIDTH == 8
1096 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
1097 __m128 srcLo = _mm256_extractf128_ps(in, 0);
1098 __m128 srcHi = _mm256_extractf128_ps(in, 1);
1099
1100 srcLo = ConvertFloatToSRGB2(srcLo);
1101 srcHi = ConvertFloatToSRGB2(srcHi);
1102
1103 in = _mm256_insertf128_ps(in, srcLo, 0);
1104 in = _mm256_insertf128_ps(in, srcHi, 1);
1105 #endif
1106 #else
1107 #error Unsupported vector width
1108 #endif
1109 return in;
1110 }
1111 #if ENABLE_AVX512_SIMD16
1112
1113 static inline simd16scalar convertSrgb(simd16scalar &in)
1114 {
1115 return ConvertFloatToSRGB2(in);
1116 }
1117 #endif
1118 };
1119
//////////////////////////////////////////////////////////////////////////
/// Format1 - Bitfield for single component formats.
/// All four members share the same x-bit storage; only 'r' is the real
/// component.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x>
struct Format1
{
    union
    {
        uint32_t r : x;

        ///@ The following are here to provide full template needed in Formats.
        uint32_t g : x, b : x, a : x;
    };
};
1136
1137 //////////////////////////////////////////////////////////////////////////
1138 /// Format1 - Bitfield for single component formats - 8 bit specialization
1139 //////////////////////////////////////////////////////////////////////////
1140 template<>
1141 struct Format1<8>
1142 {
1143 union
1144 {
1145 uint8_t r;
1146
1147 ///@ The following are here to provide full template needed in Formats.
1148 uint8_t g;
1149 uint8_t b;
1150 uint8_t a;
1151 };
1152 };
1153
1154 //////////////////////////////////////////////////////////////////////////
1155 /// Format1 - Bitfield for single component formats - 16 bit specialization
1156 //////////////////////////////////////////////////////////////////////////
1157 template<>
1158 struct Format1<16>
1159 {
1160 union
1161 {
1162 uint16_t r;
1163
1164 ///@ The following are here to provide full template needed in Formats.
1165 uint16_t g;
1166 uint16_t b;
1167 uint16_t a;
1168 };
1169 };
1170
//////////////////////////////////////////////////////////////////////////
/// Format2 - Bitfield for 2 component formats.
/// 'r' (x bits) and 'g' (y bits) are the real components; 'b' and 'a'
/// overlay the same storage.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y>
union Format2
{
    // Real components.
    struct
    {
        uint32_t r : x;
        uint32_t g : y;
    };

    ///@ The following are here to provide full template needed in Formats.
    struct
    {
        uint32_t b : x;
        uint32_t a : y;
    };
};
1189
1190 //////////////////////////////////////////////////////////////////////////
1191 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
1192 //////////////////////////////////////////////////////////////////////////
1193 template<>
1194 union Format2<8,8>
1195 {
1196 struct
1197 {
1198 uint16_t r : 8;
1199 uint16_t g : 8;
1200 };
1201 struct
1202 {
1203 ///@ The following are here to provide full template needed in Formats.
1204 uint16_t b : 8;
1205 uint16_t a : 8;
1206 };
1207 };
1208
//////////////////////////////////////////////////////////////////////////
/// Format3 - Bitfield for 3 component formats.
/// 'r', 'g' and 'b' are packed bitfields of x, y and z bits; 'a' is a
/// full uint32_t overlaying the same storage.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z>
union Format3
{
    // Real components.
    struct
    {
        uint32_t r : x;
        uint32_t g : y;
        uint32_t b : z;
    };

    ///@note This is here to provide full template needed in Formats.
    uint32_t a;
};
1223
1224 //////////////////////////////////////////////////////////////////////////
1225 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
1226 //////////////////////////////////////////////////////////////////////////
1227 template<>
1228 union Format3<5,6,5>
1229 {
1230 struct
1231 {
1232 uint16_t r : 5;
1233 uint16_t g : 6;
1234 uint16_t b : 5;
1235 };
1236 uint16_t a; ///@note This is here to provide full template needed in Formats.
1237 };
1238
//////////////////////////////////////////////////////////////////////////
/// Format4 - Bitfield for 4 component formats.
/// Components are declared in r, g, b, a order with widths x, y, z, w.
//////////////////////////////////////////////////////////////////////////
template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
struct Format4
{
    uint32_t r : x, g : y, b : z, a : w;
};
1250
1251 //////////////////////////////////////////////////////////////////////////
1252 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1253 //////////////////////////////////////////////////////////////////////////
1254 template<>
1255 struct Format4<5,5,5,1>
1256 {
1257 uint16_t r : 5;
1258 uint16_t g : 5;
1259 uint16_t b : 5;
1260 uint16_t a : 1;
1261 };
1262
1263 //////////////////////////////////////////////////////////////////////////
1264 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1265 //////////////////////////////////////////////////////////////////////////
1266 template<>
1267 struct Format4<4,4,4,4>
1268 {
1269 uint16_t r : 4;
1270 uint16_t g : 4;
1271 uint16_t b : 4;
1272 uint16_t a : 4;
1273 };
1274
1275 //////////////////////////////////////////////////////////////////////////
1276 /// ComponentTraits - Default components
1277 //////////////////////////////////////////////////////////////////////////
1278 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1279 struct Defaults
1280 {
1281 INLINE static uint32_t GetDefault(uint32_t comp)
1282 {
1283 static const uint32_t defaults[4]{ x, y, z, w };
1284 return defaults[comp];
1285 }
1286 };
1287
1288 //////////////////////////////////////////////////////////////////////////
1289 /// ComponentTraits - Component type traits.
1290 //////////////////////////////////////////////////////////////////////////
1291 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
1292 struct ComponentTraits
1293 {
1294 INLINE static SWR_TYPE GetType(uint32_t comp)
1295 {
1296 static const SWR_TYPE CompType[4]{ X, Y, Z, W };
1297 return CompType[comp];
1298 }
1299
1300 INLINE static uint32_t GetBPC(uint32_t comp)
1301 {
1302 static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
1303 return MyBpc[comp];
1304 }
1305
1306 INLINE static bool isNormalized(uint32_t comp)
1307 {
1308 switch (comp)
1309 {
1310 case 0:
1311 return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
1312 case 1:
1313 return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
1314 case 2:
1315 return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
1316 case 3:
1317 return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
1318 }
1319 SWR_ASSERT(0);
1320 return false;
1321 }
1322
1323 INLINE static float toFloat(uint32_t comp)
1324 {
1325 switch (comp)
1326 {
1327 case 0:
1328 return TypeTraits<X, NumBitsX>::toFloat();
1329 case 1:
1330 return TypeTraits<Y, NumBitsY>::toFloat();
1331 case 2:
1332 return TypeTraits<Z, NumBitsZ>::toFloat();
1333 case 3:
1334 return TypeTraits<W, NumBitsW>::toFloat();
1335 }
1336 SWR_ASSERT(0);
1337 return TypeTraits<X, NumBitsX>::toFloat();
1338
1339 }
1340
1341 INLINE static float fromFloat(uint32_t comp)
1342 {
1343 switch (comp)
1344 {
1345 case 0:
1346 return TypeTraits<X, NumBitsX>::fromFloat();
1347 case 1:
1348 return TypeTraits<Y, NumBitsY>::fromFloat();
1349 case 2:
1350 return TypeTraits<Z, NumBitsZ>::fromFloat();
1351 case 3:
1352 return TypeTraits<W, NumBitsW>::fromFloat();
1353 }
1354 SWR_ASSERT(0);
1355 return TypeTraits<X, NumBitsX>::fromFloat();
1356 }
1357
1358 INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
1359 {
1360 switch (comp)
1361 {
1362 case 0:
1363 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1364 case 1:
1365 return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
1366 case 2:
1367 return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
1368 case 3:
1369 return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1370 }
1371 SWR_ASSERT(0);
1372 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1373 }
1374
1375 INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
1376 {
1377 switch (comp)
1378 {
1379 case 0:
1380 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1381 return;
1382 case 1:
1383 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1384 return;
1385 case 2:
1386 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1387 return;
1388 case 3:
1389 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1390 return;
1391 }
1392 SWR_ASSERT(0);
1393 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1394 }
1395
1396 INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1397 {
1398 switch (comp)
1399 {
1400 case 0:
1401 return TypeTraits<X, NumBitsX>::unpack(in);
1402 case 1:
1403 return TypeTraits<Y, NumBitsY>::unpack(in);
1404 case 2:
1405 return TypeTraits<Z, NumBitsZ>::unpack(in);
1406 case 3:
1407 return TypeTraits<W, NumBitsW>::unpack(in);
1408 }
1409 SWR_ASSERT(0);
1410 return TypeTraits<X, NumBitsX>::unpack(in);
1411 }
1412
1413 INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1414 {
1415 switch (comp)
1416 {
1417 case 0:
1418 return TypeTraits<X, NumBitsX>::pack(in);
1419 case 1:
1420 return TypeTraits<Y, NumBitsY>::pack(in);
1421 case 2:
1422 return TypeTraits<Z, NumBitsZ>::pack(in);
1423 case 3:
1424 return TypeTraits<W, NumBitsW>::pack(in);
1425 }
1426 SWR_ASSERT(0);
1427 return TypeTraits<X, NumBitsX>::pack(in);
1428 }
1429
1430 INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1431 {
1432 switch (comp)
1433 {
1434 case 0:
1435 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1436 case 1:
1437 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1438 case 2:
1439 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1440 case 3:
1441 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1442 }
1443 SWR_ASSERT(0);
1444 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1445 }
1446 #if ENABLE_AVX512_SIMD16
1447
1448 INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
1449 {
1450 switch (comp)
1451 {
1452 case 0:
1453 return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1454 case 1:
1455 return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
1456 case 2:
1457 return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
1458 case 3:
1459 return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
1460 }
1461 SWR_ASSERT(0);
1462 return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1463 }
1464
1465 INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
1466 {
1467 switch (comp)
1468 {
1469 case 0:
1470 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1471 return;
1472 case 1:
1473 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1474 return;
1475 case 2:
1476 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1477 return;
1478 case 3:
1479 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1480 return;
1481 }
1482 SWR_ASSERT(0);
1483 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1484 }
1485
1486 INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
1487 {
1488 switch (comp)
1489 {
1490 case 0:
1491 return TypeTraits<X, NumBitsX>::unpack(in);
1492 case 1:
1493 return TypeTraits<Y, NumBitsY>::unpack(in);
1494 case 2:
1495 return TypeTraits<Z, NumBitsZ>::unpack(in);
1496 case 3:
1497 return TypeTraits<W, NumBitsW>::unpack(in);
1498 }
1499 SWR_ASSERT(0);
1500 return TypeTraits<X, NumBitsX>::unpack(in);
1501 }
1502
1503 INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
1504 {
1505 switch (comp)
1506 {
1507 case 0:
1508 return TypeTraits<X, NumBitsX>::pack(in);
1509 case 1:
1510 return TypeTraits<Y, NumBitsY>::pack(in);
1511 case 2:
1512 return TypeTraits<Z, NumBitsZ>::pack(in);
1513 case 3:
1514 return TypeTraits<W, NumBitsW>::pack(in);
1515 }
1516 SWR_ASSERT(0);
1517 return TypeTraits<X, NumBitsX>::pack(in);
1518 }
1519
1520 INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
1521 {
1522 switch (comp)
1523 {
1524 case 0:
1525 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1526 case 1:
1527 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1528 case 2:
1529 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1530 case 3:
1531 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1532 }
1533 SWR_ASSERT(0);
1534 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1535 }
1536 #endif
1537 };