[mesa.git] / src / gallium / drivers / swr / rasterizer / core / format_types.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23  * @file format_types.h
24 *
25 * @brief Definitions for SWR_FORMAT functions.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "utils.h"
31
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
34 //////////////////////////////////////////////////////////////////////////
35 template <uint32_t NumBits, bool Signed = false>
36 struct PackTraits
37 {
38 static const uint32_t MyNumBits = NumBits;
39 static simdscalar loadSOA(const uint8_t *pSrc) = delete;
40 static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
41 static simdscalar unpack(simdscalar &in) = delete;
42 static simdscalar pack(simdscalar &in) = delete;
43 #if ENABLE_AVX512_SIMD16
44 static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
45 static void storeSOA(uint8_t *pDst, simd16scalar src) = delete;
46 static simd16scalar unpack(simd16scalar &in) = delete;
47 static simd16scalar pack(simd16scalar &in) = delete;
48 #endif
49 };
50
51 //////////////////////////////////////////////////////////////////////////
52 /// PackTraits - Helpers for packing / unpacking unused channels
53 //////////////////////////////////////////////////////////////////////////
54 template <>
55 struct PackTraits<0, false>
56 {
57 static const uint32_t MyNumBits = 0;
58
59 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
60 static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
61 static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
62 static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
63 #if ENABLE_AVX512_SIMD16
64 static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
65 static void storeSOA(uint8_t *pDst, simd16scalar src) { return; }
66 static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
67 static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
68 #endif
69 };
70
71 //////////////////////////////////////////////////////////////////////////
72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
73 //////////////////////////////////////////////////////////////////////////
74 template <>
75 struct PackTraits<8, false>
76 {
77 static const uint32_t MyNumBits = 8;
78
79 static simdscalar loadSOA(const uint8_t *pSrc)
80 {
81 #if KNOB_SIMD_WIDTH == 8
82 __m256 result = _mm256_setzero_ps();
83 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
84 return _mm256_insertf128_ps(result, vLo, 0);
85 #else
86 #error Unsupported vector width
87 #endif
88 }
89
90 static void storeSOA(uint8_t *pDst, simdscalar src)
91 {
92 // store simd bytes
93 #if KNOB_SIMD_WIDTH == 8
94 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
95 #else
96 #error Unsupported vector width
97 #endif
98 }
99
100 static simdscalar unpack(simdscalar &in)
101 {
102 #if KNOB_SIMD_WIDTH == 8
103 #if KNOB_ARCH==KNOB_ARCH_AVX
104 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
105 __m128i resLo = _mm_cvtepu8_epi32(src);
106 __m128i resHi = _mm_shuffle_epi8(src,
107 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
108
109 __m256i result = _mm256_castsi128_si256(resLo);
110 result = _mm256_insertf128_si256(result, resHi, 1);
111 return _mm256_castsi256_ps(result);
112 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
113 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
114 #endif
115 #else
116 #error Unsupported vector width
117 #endif
118 }
119
120 static simdscalar pack(simdscalar &in)
121 {
122 #if KNOB_SIMD_WIDTH == 8
123 simdscalari src = _simd_castps_si(in);
124 __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
125 __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
126 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
127 #else
128 #error Unsupported vector width
129 #endif
130 }
131 #if ENABLE_AVX512_SIMD16
132
133 static simd16scalar loadSOA_16(const uint8_t *pSrc)
134 {
135 simd16scalar result = _simd16_setzero_ps();
136 simdscalar resultlo = _simd_setzero_ps();
137
138 const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
139
140 resultlo = _mm256_insertf128_ps(resultlo, src, 0);
141 result = _simd16_insert_ps(result, resultlo, 0);
142
143 return result;
144 }
145
146 static void storeSOA(uint8_t *pDst, simd16scalar src)
147 {
148 // store simd16 bytes
149 _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
150 }
151
152 static simd16scalar unpack(simd16scalar &in)
153 {
154 simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
155
156 return _simd16_castsi_ps(result);
157 }
158
159 static simd16scalar pack(simd16scalar &in)
160 {
161 simd16scalari result = _simd16_setzero_si();
162
163 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
164 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
165
166 simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
167 simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
168
169 simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
170
171 const simdscalari zero = _simd_setzero_si();
172
173 permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
174 permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
175
176 pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
177
178 result = _simd16_insert_si(result, pack, 0);
179
180 return _simd16_castsi_ps(result);
181 }
182 #endif
183 };
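
// ----------------------------------------------------------------------
// Illustrative sketch (not part of the driver): the intended round trip
// through a PackTraits specialization for one 8-bit unsigned channel
// stored in SOA layout. The function name and buffers are hypothetical;
// the real callers are the format load/store paths.
static inline void Example_RoundTrip8bitChannel(const uint8_t *pSrcSOA, uint8_t *pDstSOA)
{
    simdscalar raw      = PackTraits<8>::loadSOA(pSrcSOA);   // load one byte per SIMD lane
    simdscalar unpacked = PackTraits<8>::unpack(raw);        // widen each byte to a 32-bit lane

    // ... per-lane work on the 32-bit representation happens here ...

    simdscalar packed = PackTraits<8>::pack(unpacked);       // narrow back down to one byte per lane
    PackTraits<8>::storeSOA(pDstSOA, packed);                // store the 8 packed bytes
}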
184
185 //////////////////////////////////////////////////////////////////////////
186 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
187 //////////////////////////////////////////////////////////////////////////
188 template <>
189 struct PackTraits<8, true>
190 {
191 static const uint32_t MyNumBits = 8;
192
193 static simdscalar loadSOA(const uint8_t *pSrc)
194 {
195 #if KNOB_SIMD_WIDTH == 8
196 __m256 result = _mm256_setzero_ps();
197 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
198 return _mm256_insertf128_ps(result, vLo, 0);
199 #else
200 #error Unsupported vector width
201 #endif
202 }
203
204 static void storeSOA(uint8_t *pDst, simdscalar src)
205 {
206 // store simd bytes
207 #if KNOB_SIMD_WIDTH == 8
208 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
209 #else
210 #error Unsupported vector width
211 #endif
212 }
213
214 static simdscalar unpack(simdscalar &in)
215 {
216 #if KNOB_SIMD_WIDTH == 8
217 #if KNOB_ARCH==KNOB_ARCH_AVX
218 SWR_INVALID("I think this may be incorrect.");
219 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
220 __m128i resLo = _mm_cvtepi8_epi32(src);
221 __m128i resHi = _mm_shuffle_epi8(src,
222 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
223
224 __m256i result = _mm256_castsi128_si256(resLo);
225 result = _mm256_insertf128_si256(result, resHi, 1);
226 return _mm256_castsi256_ps(result);
227 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
228 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
229 #endif
230 #else
231 #error Unsupported vector width
232 #endif
233 }
234
235 static simdscalar pack(simdscalar &in)
236 {
237 #if KNOB_SIMD_WIDTH == 8
238 simdscalari src = _simd_castps_si(in);
239 __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
240 __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
241 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
242 #else
243 #error Unsupported vector width
244 #endif
245 }
246 #if ENABLE_AVX512_SIMD16
247
248 static simd16scalar loadSOA_16(const uint8_t *pSrc)
249 {
250 simd16scalar result = _simd16_setzero_ps();
251 simdscalar resultlo = _simd_setzero_ps();
252
253 const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
254
255 resultlo = _mm256_insertf128_ps(resultlo, src, 0);
256 result = _simd16_insert_ps(result, resultlo, 0);
257
258 return result;
259 }
260
261 static void storeSOA(uint8_t *pDst, simd16scalar src)
262 {
263 // store simd16 bytes
264 _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
265 }
266
267 static simd16scalar unpack(simd16scalar &in)
268 {
269         simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)))); // note: zero-extends; a sign-extending convert may be intended here for this signed trait
270
271 return _simd16_castsi_ps(result);
272 }
273
274 static simd16scalar pack(simd16scalar &in)
275 {
276 simd16scalari result = _simd16_setzero_si();
277
278 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
279 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
280
281 simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
282 simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
283
284 simdscalari pack = _simd_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
285
286 const simdscalari zero = _simd_setzero_si();
287
288 permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
289 permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
290
291 pack = _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
292
293 result = _simd16_insert_si(result, pack, 0);
294
295 return _simd16_castsi_ps(result);
296 }
297 #endif
298 };
299
300 //////////////////////////////////////////////////////////////////////////
301 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
302 //////////////////////////////////////////////////////////////////////////
303 template <>
304 struct PackTraits<16, false>
305 {
306 static const uint32_t MyNumBits = 16;
307
308 static simdscalar loadSOA(const uint8_t *pSrc)
309 {
310 #if KNOB_SIMD_WIDTH == 8
311 __m256 result = _mm256_setzero_ps();
312 __m128 vLo = _mm_load_ps((const float*)pSrc);
313 return _mm256_insertf128_ps(result, vLo, 0);
314 #else
315 #error Unsupported vector width
316 #endif
317 }
318
319 static void storeSOA(uint8_t *pDst, simdscalar src)
320 {
321 #if KNOB_SIMD_WIDTH == 8
322 // store 16B (2B * 8)
323 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
324 #else
325 #error Unsupported vector width
326 #endif
327 }
328
329 static simdscalar unpack(simdscalar &in)
330 {
331 #if KNOB_SIMD_WIDTH == 8
332 #if KNOB_ARCH==KNOB_ARCH_AVX
333 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
334 __m128i resLo = _mm_cvtepu16_epi32(src);
335 __m128i resHi = _mm_shuffle_epi8(src,
336 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
337
338 __m256i result = _mm256_castsi128_si256(resLo);
339 result = _mm256_insertf128_si256(result, resHi, 1);
340 return _mm256_castsi256_ps(result);
341 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
342 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
343 #endif
344 #else
345 #error Unsupported vector width
346 #endif
347 }
348
349 static simdscalar pack(simdscalar &in)
350 {
351 #if KNOB_SIMD_WIDTH == 8
352 simdscalari src = _simd_castps_si(in);
353 __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
354 return _mm256_castsi256_ps(res);
355 #else
356 #error Unsupported vector width
357 #endif
358 }
359 #if ENABLE_AVX512_SIMD16
360
361 static simd16scalar loadSOA_16(const uint8_t *pSrc)
362 {
363 simd16scalar result = _simd16_setzero_ps();
364
365 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
366
367 result = _simd16_insert_ps(result, resultlo, 0);
368
369 return result;
370 }
371
372 static void storeSOA(uint8_t *pDst, simd16scalar src)
373 {
374 _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
375 }
376
377 static simd16scalar unpack(simd16scalar &in)
378 {
379 simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
380
381 return _simd16_castsi_ps(result);
382 }
383
384 static simd16scalar pack(simd16scalar &in)
385 {
386 const simd16scalari zero = _simd16_setzero_si();
387
388 simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
389 simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
390
391 simd16scalari result = _simd16_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
392
393 return _simd16_castsi_ps(result);
394 }
395 #endif
396 };
397
398 //////////////////////////////////////////////////////////////////////////
399 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
400 //////////////////////////////////////////////////////////////////////////
401 template <>
402 struct PackTraits<16, true>
403 {
404 static const uint32_t MyNumBits = 16;
405
406 static simdscalar loadSOA(const uint8_t *pSrc)
407 {
408 #if KNOB_SIMD_WIDTH == 8
409 __m256 result = _mm256_setzero_ps();
410 __m128 vLo = _mm_load_ps((const float*)pSrc);
411 return _mm256_insertf128_ps(result, vLo, 0);
412 #else
413 #error Unsupported vector width
414 #endif
415 }
416
417 static void storeSOA(uint8_t *pDst, simdscalar src)
418 {
419 #if KNOB_SIMD_WIDTH == 8
420 // store 16B (2B * 8)
421 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
422 #else
423 #error Unsupported vector width
424 #endif
425 }
426
427 static simdscalar unpack(simdscalar &in)
428 {
429 #if KNOB_SIMD_WIDTH == 8
430 #if KNOB_ARCH==KNOB_ARCH_AVX
431 SWR_INVALID("I think this may be incorrect.");
432 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
433 __m128i resLo = _mm_cvtepi16_epi32(src);
434 __m128i resHi = _mm_shuffle_epi8(src,
435 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
436
437 __m256i result = _mm256_castsi128_si256(resLo);
438 result = _mm256_insertf128_si256(result, resHi, 1);
439 return _mm256_castsi256_ps(result);
440 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
441 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
442 #endif
443 #else
444 #error Unsupported vector width
445 #endif
446 }
447
448 static simdscalar pack(simdscalar &in)
449 {
450 #if KNOB_SIMD_WIDTH == 8
451 simdscalari src = _simd_castps_si(in);
452 __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
453 return _mm256_castsi256_ps(res);
454 #else
455 #error Unsupported vector width
456 #endif
457 }
458 #if ENABLE_AVX512_SIMD16
459
460 static simd16scalar loadSOA_16(const uint8_t *pSrc)
461 {
462 simd16scalar result = _simd16_setzero_ps();
463
464 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
465
466 result = _simd16_insert_ps(result, resultlo, 0);
467
468 return result;
469 }
470
471 static void storeSOA(uint8_t *pDst, simd16scalar src)
472 {
473 _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
474 }
475
476 static simd16scalar unpack(simd16scalar &in)
477 {
478         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0))); // note: zero-extends; a sign-extending convert may be intended here for this signed trait
479
480 return _simd16_castsi_ps(result);
481 }
482
483 static simd16scalar pack(simd16scalar &in)
484 {
485 const simd16scalari zero = _simd16_setzero_si();
486
487 simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
488 simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
489
490 simd16scalari result = _simd16_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
491
492 return _simd16_castsi_ps(result);
493 }
494 #endif
495 };
496
497 //////////////////////////////////////////////////////////////////////////
498 /// PackTraits - Helpers for packing / unpacking 32 bit channels
499 //////////////////////////////////////////////////////////////////////////
500 template <>
501 struct PackTraits<32, false>
502 {
503 static const uint32_t MyNumBits = 32;
504
505 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
506 static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
507 static simdscalar unpack(simdscalar &in) { return in; }
508 static simdscalar pack(simdscalar &in) { return in; }
509 #if ENABLE_AVX512_SIMD16
510
511 static simd16scalar loadSOA_16(const uint8_t *pSrc)
512 {
513 return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
514 }
515
516 static void storeSOA(uint8_t *pDst, simd16scalar src)
517 {
518 _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
519 }
520
521 static simd16scalar unpack(simd16scalar &in)
522 {
523 return in;
524 }
525
526 static simd16scalar pack(simd16scalar &in)
527 {
528 return in;
529 }
530 #endif
531 };
532
533 //////////////////////////////////////////////////////////////////////////
534 /// TypeTraits - Format type traits.
535 //////////////////////////////////////////////////////////////////////////
536 template<SWR_TYPE type, uint32_t NumBits>
537 struct TypeTraits : PackTraits<NumBits>
538 {
539 static const SWR_TYPE MyType = type;
540 static float toFloat() { return 0.0; }
541 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
542 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
543 };
544
545 //////////////////////////////////////////////////////////////////////////
546 /// TypeTraits - Format type traits specialization for UINT8
547 //////////////////////////////////////////////////////////////////////////
548 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
549 {
550 static const SWR_TYPE MyType = SWR_TYPE_UINT;
551 static float toFloat() { return 0.0; }
552 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
553 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
554 };
555
556 //////////////////////////////////////////////////////////////////////////
557 /// TypeTraits - Format type traits specialization for SINT8
558 //////////////////////////////////////////////////////////////////////////
559 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
560 {
561 static const SWR_TYPE MyType = SWR_TYPE_SINT;
562 static float toFloat() { return 0.0; }
563 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
564 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
565 };
566
567 //////////////////////////////////////////////////////////////////////////
568 /// TypeTraits - Format type traits specialization for UINT16
569 //////////////////////////////////////////////////////////////////////////
570 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
571 {
572 static const SWR_TYPE MyType = SWR_TYPE_UINT;
573 static float toFloat() { return 0.0; }
574 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
575 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
576 };
577
578 //////////////////////////////////////////////////////////////////////////
579 /// TypeTraits - Format type traits specialization for SINT16
580 //////////////////////////////////////////////////////////////////////////
581 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
582 {
583 static const SWR_TYPE MyType = SWR_TYPE_SINT;
584 static float toFloat() { return 0.0; }
585 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
586 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
587 };
588
589 //////////////////////////////////////////////////////////////////////////
590 /// TypeTraits - Format type traits specialization for UINT32
591 //////////////////////////////////////////////////////////////////////////
592 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
593 {
594 static const SWR_TYPE MyType = SWR_TYPE_UINT;
595 static float toFloat() { return 0.0; }
596 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
597 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
598 };
599
600 //////////////////////////////////////////////////////////////////////////
601 /// TypeTraits - Format type traits specialization for SINT32
602 //////////////////////////////////////////////////////////////////////////
603 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
604 {
605 static const SWR_TYPE MyType = SWR_TYPE_SINT;
606 static float toFloat() { return 0.0; }
607 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
608 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
609 };
610
611 //////////////////////////////////////////////////////////////////////////
612 /// TypeTraits - Format type traits specialization for UNORM5
613 //////////////////////////////////////////////////////////////////////////
614 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
615 {
616 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
617 static float toFloat() { return 1.0f / 31.0f; }
618 static float fromFloat() { return 31.0f; }
619 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
620 };
621
622 //////////////////////////////////////////////////////////////////////////
623 /// TypeTraits - Format type traits specialization for UNORM6
624 //////////////////////////////////////////////////////////////////////////
625 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
626 {
627 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
628 static float toFloat() { return 1.0f / 63.0f; }
629 static float fromFloat() { return 63.0f; }
630 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
631 };
632
633 //////////////////////////////////////////////////////////////////////////
634 /// TypeTraits - Format type traits specialization for UNORM8
635 //////////////////////////////////////////////////////////////////////////
636 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
637 {
638 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
639 static float toFloat() { return 1.0f / 255.0f; }
640 static float fromFloat() { return 255.0f; }
641 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
642 };
643
644 //////////////////////////////////////////////////////////////////////////
645 /// TypeTraits - Format type traits specialization for SNORM8
646 //////////////////////////////////////////////////////////////////////////
647 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
648 {
649 static const SWR_TYPE MyType = SWR_TYPE_SNORM;
650 static float toFloat() { return 1.0f / 127.0f; }
651 static float fromFloat() { return 127.0f; }
652 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
653 };
654
655 //////////////////////////////////////////////////////////////////////////
656 /// TypeTraits - Format type traits specialization for UNORM16
657 //////////////////////////////////////////////////////////////////////////
658 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
659 {
660 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
661 static float toFloat() { return 1.0f / 65535.0f; }
662 static float fromFloat() { return 65535.0f; }
663 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
664 };
665
666 //////////////////////////////////////////////////////////////////////////
667 /// TypeTraits - Format type traits specialization for SNORM16
668 //////////////////////////////////////////////////////////////////////////
669 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
670 {
671     static const SWR_TYPE MyType = SWR_TYPE_SNORM;
672 static float toFloat() { return 1.0f / 32767.0f; }
673 static float fromFloat() { return 32767.0f; }
674 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
675 };
676
677 //////////////////////////////////////////////////////////////////////////
678 /// TypeTraits - Format type traits specialization for UNORM24
679 //////////////////////////////////////////////////////////////////////////
680 template<>
681 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
682 {
683 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
684 static float toFloat() { return 1.0f / 16777215.0f; }
685 static float fromFloat() { return 16777215.0f; }
686 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
687 };
688
689 //////////////////////////////////////////////////////////////////////////
690 // FLOAT Specializations from here on...
691 //////////////////////////////////////////////////////////////////////////
692 #define TO_M128i(a) _mm_castps_si128(a)
693 #define TO_M128(a) _mm_castsi128_ps(a)
694
695 #include "math.h"
696
697 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
698 inline static __m128 fastpow(__m128 arg) {
699 __m128 ret = arg;
700
701 static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
702 * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
703
704 // Apply a constant pre-correction factor.
705 ret = _mm_mul_ps(ret, factor);
706
707 // Reinterpret arg as integer to obtain logarithm.
708 //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
709 ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
710
711 // Multiply logarithm by power.
712 ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
713
714 // Convert back to "integer" to exponentiate.
715 //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
716 ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
717
718 return ret;
719 }
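
// ----------------------------------------------------------------------
// Illustrative scalar sketch (not used by the driver) of the trick fastpow
// relies on: the integer bit pattern of an IEEE-754 float is roughly an
// affine function of log2(x), so scaling that pattern and converting back
// approximates x^p for positive x. The function name is hypothetical and
// the pointer punning mirrors the style used elsewhere in this file.
static inline float Example_FastPowScalar(float x, float p)
{
    int32_t bits = *reinterpret_cast<const int32_t *>(&x);      // bits(x) ~= 2^23 * (log2(x) + 127)

    float log2x = float(bits) * (1.0f / (1 << 23)) - 127.0f;    // approximate log2(x)
    float ybits = (log2x * p + 127.0f) * float(1 << 23);        // approximate bit pattern of x^p

    int32_t out = int32_t(ybits);
    return *reinterpret_cast<const float *>(&out);              // reinterpret back to float ~= x^p
}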
720
721 inline static __m128 pow512_4(__m128 arg) {
722 // 5/12 is too small, so compute the 4th root of 20/12 instead.
723 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
724 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
725 __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
726 __m128 xover = _mm_mul_ps(arg, xf);
727
728 __m128 xfm1 = _mm_rsqrt_ps(xf);
729 __m128 x2 = _mm_mul_ps(arg, arg);
730 __m128 xunder = _mm_mul_ps(x2, xfm1);
731
732 // sqrt2 * over + 2 * sqrt2 * under
733 __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
734 _mm_add_ps(xover, xunder));
735
736     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); // y * rsqrt(y) ~= sqrt(y)
737     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); // applied twice ~= the 4th root of xavg
738 return xavg;
739 }
740
741 inline static __m128 powf_wrapper(__m128 Base, float Exp)
742 {
743 float *f = (float *)(&Base);
744
745 return _mm_set_ps(powf(f[3], Exp),
746 powf(f[2], Exp),
747 powf(f[1], Exp),
748 powf(f[0], Exp));
749 }
750
751 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
752 {
753 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
754 __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
755
756 // squeeze the mask down to 16 bits (4 bits per DWORD)
757 int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
758
759 __m128 Result;
760
761     // branch on whether all, none, or only some lanes take the linear segment
762 if (CompareResult == 0xFFFF)
763 {
764 // all DWORDs are <= the threshold
765 Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
766 }
767 else if (CompareResult == 0x0)
768 {
769 // all DWORDs are > the threshold
770 __m128 fSrc_0RGB = Src;
771
772 // --> 1.055f * c(1.0f/2.4f) - 0.055f
773 #if KNOB_USE_FAST_SRGB == TRUE
774 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
775 __m128 f = pow512_4(fSrc_0RGB);
776 #else
777 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
778 #endif
779 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
780 Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
781 }
782 else
783 {
784 // some DWORDs are <= the threshold and some are > threshold
785 __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
786
787 __m128 fSrc_0RGB = Src;
788
789 // --> 1.055f * c(1.0f/2.4f) - 0.055f
790 #if KNOB_USE_FAST_SRGB == TRUE
791 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
792 __m128 f = pow512_4(fSrc_0RGB);
793 #else
794 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
795 #endif
796 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
797 f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
798
799         // intended to clear the alpha lane (garbage after the sub); note the mask below is currently all ones, so nothing is actually cleared
800 __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
801
802 __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
803 __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
804 __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
805
806 Result = TO_M128(CombinedParts);
807 }
808
809 return Result;
810 }
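
// ----------------------------------------------------------------------
// Illustrative scalar reference (not used by the driver) of the piecewise
// sRGB encode that ConvertFloatToSRGB2 vectorizes. The function name is
// hypothetical.
//   linear <= 0.0031308  ->  12.92 * linear
//   otherwise            ->  1.055 * linear^(1/2.4) - 0.055
static inline float Example_LinearToSRGBScalar(float linear)
{
    if (linear <= 0.0031308f)
    {
        return linear * 12.92f;
    }
    return 1.055f * powf(linear, 1.0f / 2.4f) - 0.055f;
}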
811
812 #if ENABLE_AVX512_SIMD16
813 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
814 inline static simd16scalar fastpow(simd16scalar value)
815 {
816 static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
817 * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
818
819 // Apply a constant pre-correction factor.
820 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
821
822 // Reinterpret arg as integer to obtain logarithm.
823 //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
824 result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
825
826 // Multiply logarithm by power.
827 result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
828
829 // Convert back to "integer" to exponentiate.
830 //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
831 result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
832
833 return result;
834 }
835
836 inline static simd16scalar pow512_4(simd16scalar arg)
837 {
838 // 5/12 is too small, so compute the 4th root of 20/12 instead.
839 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
840 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
841 simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
842 simd16scalar xover = _simd16_mul_ps(arg, xf);
843
844 simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
845 simd16scalar x2 = _simd16_mul_ps(arg, arg);
846 simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
847
848 // sqrt2 * over + 2 * sqrt2 * under
849 simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));
850
851     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg)); // y * rsqrt(y) ~= sqrt(y)
852     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg)); // applied twice ~= the 4th root of xavg
853
854 return xavg;
855 }
856
857 inline static simd16scalar powf_wrapper(const simd16scalar base, float exp)
858 {
859 const float *f = reinterpret_cast<const float *>(&base);
860
861 return _simd16_set_ps(
862 powf(f[15], exp),
863 powf(f[14], exp),
864 powf(f[13], exp),
865 powf(f[12], exp),
866 powf(f[11], exp),
867 powf(f[10], exp),
868 powf(f[ 9], exp),
869 powf(f[ 8], exp),
870 powf(f[ 7], exp),
871 powf(f[ 6], exp),
872 powf(f[ 5], exp),
873 powf(f[ 4], exp),
874 powf(f[ 3], exp),
875 powf(f[ 2], exp),
876 powf(f[ 1], exp),
877 powf(f[ 0], exp)
878 );
879 }
880
881 // float to SRGB conversion formula
882 //
883 // if (value < 0.0031308f)
884 // value *= 12.92f;
885 // else
886 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
887 //
888 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
889 {
890 // create a mask where the source is < the minimal SRGB float value
891 const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
892
893 // if all elements are < the threshold, result = value * 12.92
894 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
895
896 if (_simd16_mask2int(mask) != 0xFFFF)
897 {
898 // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
899 #if KNOB_USE_FAST_SRGB == TRUE
900 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
901 simd16scalar result2 = pow512_4(value);
902 #else
903 simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
904 #endif
905
906 result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
907 result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
908
909 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
910 // only native AVX512 can directly use the computed mask for the blend operation
911 result = _mm512_mask_blend_ps(mask, result2, result);
912 #else
913 result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
914 #endif
915 }
916
917 return result;
918 }
919
920 #endif
921 //////////////////////////////////////////////////////////////////////////
922 /// TypeTraits - Format type traits specialization for FLOAT16
923 //////////////////////////////////////////////////////////////////////////
924 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
925 {
926 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
927 static float toFloat() { return 1.0f; }
928 static float fromFloat() { return 1.0f; }
929 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
930
931 static simdscalar pack(const simdscalar &in)
932 {
933 #if KNOB_SIMD_WIDTH == 8
934 #if (KNOB_ARCH == KNOB_ARCH_AVX)
935 // input is 8 packed float32, output is 8 packed float16
936 simdscalari src = _simd_castps_si(in);
937
938 static const uint32_t FLOAT_EXP_BITS = 8;
939 static const uint32_t FLOAT_MANTISSA_BITS = 23;
940 static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
941 static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
942
943 static const uint32_t HALF_EXP_BITS = 5;
944 static const uint32_t HALF_MANTISSA_BITS = 10;
945 static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
946
947 // minimum exponent required, exponents below this are flushed to 0.
948 static const int32_t HALF_EXP_MIN = -14;
949 static const int32_t FLOAT_EXP_BIAS = 127;
950 static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
951 static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
952
953 // maximum exponent required, exponents above this are set to infinity
954 static const int32_t HALF_EXP_MAX = 15;
955 static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
956
957 const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
958 const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK);
959 const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
960 const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
961 const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
962 const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
963
964 simdscalari vSign = _simd_and_si(src, vSignMask);
965 simdscalari vExp = _simd_and_si(src, vExpMask);
966 simdscalari vMan = _simd_and_si(src, vManMask);
967
968 simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz);
969 simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
970 simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp);
971 simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
972
973 simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
974
975 // pack output 16-bits into the lower 16-bits of each 32-bit channel
976 simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
977 vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
978
979 // Flush To Zero
980 vDst = _simd_andnot_si(vFTZMask, vDst);
981 // Apply Infinites / NaN
982 vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
983
984 // Apply clamps
985 vDst = _simd_andnot_si(vClampMask, vDst);
986 vDst = _simd_or_si(vDst,
987 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
988
989 // Compute Denormals (subnormals)
990 if (!_mm256_testz_si256(vDenormMask, vDenormMask))
991 {
992 uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
993 uint32_t *pExp = (uint32_t*)&vExp;
994 uint32_t *pMan = (uint32_t*)&vMan;
995 uint32_t *pDst = (uint32_t*)&vDst;
996 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
997 {
998 if (pDenormMask[i])
999 {
1000 // Need to compute subnormal value
1001 uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
1002 uint32_t mantissa = pMan[i] |
1003 (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit
1004
1005 pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1006 }
1007 }
1008 }
1009
1010 // Add in sign bits
1011 vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
1012
1013 // Pack to lower 128-bits
1014 vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
1015
1016 #if 0
1017 #if !defined(NDEBUG)
1018 simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
1019
1020 for (uint32_t i = 0; i < 4; ++i)
1021 {
1022 SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
1023 }
1024 #endif
1025 #endif
1026
1027 return _simd_castsi_ps(vDst);
1028
1029 #else
1030 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
1031 #endif
1032 #else
1033 #error Unsupported vector width
1034 #endif
1035 }
1036
1037 static simdscalar unpack(const simdscalar &in)
1038 {
1039 // input is 8 packed float16, output is 8 packed float32
1040 SWR_NOT_IMPL; // @todo
1041 return _simd_setzero_ps();
1042 }
1043 #if ENABLE_AVX512_SIMD16
1044
1045 static simd16scalar pack(const simd16scalar &in)
1046 {
1047 simd16scalari result = _simd16_setzero_si();
1048 simdscalari resultlo = _simd_setzero_si();
1049
1050 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1051 simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
1052 simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
1053
1054 __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
1055 __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);
1056
1057 #else
1058 __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
1059 __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
1060
1061 #endif
1062 resultlo = _simd_insertf128_si(resultlo, templo, 0);
1063 resultlo = _simd_insertf128_si(resultlo, temphi, 1);
1064
1065 result = _simd16_insert_si(result, resultlo, 0);
1066
1067 return _simd16_castsi_ps(result);
1068 }
1069
1070 static simd16scalar unpack(const simd16scalar &in)
1071 {
1072 // input is 16 packed float16, output is 16 packed float32
1073 SWR_NOT_IMPL; // @todo
1074 return _simd16_setzero_ps();
1075 }
1076 #endif
1077 };
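
// ----------------------------------------------------------------------
// Illustrative scalar sketch (not used by the driver) of the float32 ->
// float16 truncating conversion that the AVX path of pack() performs per
// lane: split sign / exponent / mantissa, clamp oversized exponents to the
// largest finite half, build subnormals for small exponents, and flush the
// rest to zero. Rounding and NaN payloads are simplified; the function
// name is hypothetical.
static inline uint16_t Example_FloatToHalfScalar(float value)
{
    uint32_t bits     = *reinterpret_cast<const uint32_t *>(&value);
    uint16_t sign     = uint16_t((bits >> 16) & 0x8000);             // sign moves to bit 15
    int32_t  exponent = int32_t((bits >> 23) & 0xFF) - 127;          // unbiased exponent
    uint32_t mantissa = bits & 0x007FFFFF;

    if (exponent == 128)    // Inf / NaN
    {
        return uint16_t(sign | 0x7C00 | (mantissa ? 0x0200 : 0));
    }
    if (exponent > 15)      // too large for half: clamp to max finite, as the vector path does
    {
        return uint16_t(sign | 0x7BFF);
    }
    if (exponent >= -14)    // representable as a normal half
    {
        return uint16_t(sign | (uint32_t(exponent + 15) << 10) | (mantissa >> 13));
    }
    if (exponent >= -24)    // subnormal half: make the implicit leading 1 explicit, then shift
    {
        return uint16_t(sign | ((mantissa | 0x00800000) >> uint32_t(-exponent - 1)));
    }
    return sign;            // too small: flush to zero
}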
1078
1079 //////////////////////////////////////////////////////////////////////////
1080 /// TypeTraits - Format type traits specialization for FLOAT32
1081 //////////////////////////////////////////////////////////////////////////
1082 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
1083 {
1084 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
1085 static float toFloat() { return 1.0f; }
1086 static float fromFloat() { return 1.0f; }
1087 static inline simdscalar convertSrgb(simdscalar &in)
1088 {
1089 #if KNOB_SIMD_WIDTH == 8
1090 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
1091 __m128 srcLo = _mm256_extractf128_ps(in, 0);
1092 __m128 srcHi = _mm256_extractf128_ps(in, 1);
1093
1094 srcLo = ConvertFloatToSRGB2(srcLo);
1095 srcHi = ConvertFloatToSRGB2(srcHi);
1096
1097 in = _mm256_insertf128_ps(in, srcLo, 0);
1098 in = _mm256_insertf128_ps(in, srcHi, 1);
1099 #endif
1100 #else
1101 #error Unsupported vector width
1102 #endif
1103 return in;
1104 }
1105 #if ENABLE_AVX512_SIMD16
1106
1107 static inline simd16scalar convertSrgb(simd16scalar &in)
1108 {
1109 return ConvertFloatToSRGB2(in);
1110 }
1111 #endif
1112 };
1113
1114 //////////////////////////////////////////////////////////////////////////
1115 /// Format1 - Bitfield for single component formats.
1116 //////////////////////////////////////////////////////////////////////////
1117 template<uint32_t x>
1118 struct Format1
1119 {
1120 union
1121 {
1122 uint32_t r : x;
1123
1124 ///@ The following are here to provide full template needed in Formats.
1125 uint32_t g : x;
1126 uint32_t b : x;
1127 uint32_t a : x;
1128 };
1129 };
1130
1131 //////////////////////////////////////////////////////////////////////////
1132 /// Format1 - Bitfield for single component formats - 8 bit specialization
1133 //////////////////////////////////////////////////////////////////////////
1134 template<>
1135 struct Format1<8>
1136 {
1137 union
1138 {
1139 uint8_t r;
1140
1141 ///@ The following are here to provide full template needed in Formats.
1142 uint8_t g;
1143 uint8_t b;
1144 uint8_t a;
1145 };
1146 };
1147
1148 //////////////////////////////////////////////////////////////////////////
1149 /// Format1 - Bitfield for single component formats - 16 bit specialization
1150 //////////////////////////////////////////////////////////////////////////
1151 template<>
1152 struct Format1<16>
1153 {
1154 union
1155 {
1156 uint16_t r;
1157
1158 ///@ The following are here to provide full template needed in Formats.
1159 uint16_t g;
1160 uint16_t b;
1161 uint16_t a;
1162 };
1163 };
1164
1165 //////////////////////////////////////////////////////////////////////////
1166 /// Format2 - Bitfield for 2 component formats.
1167 //////////////////////////////////////////////////////////////////////////
1168 template<uint32_t x, uint32_t y>
1169 union Format2
1170 {
1171 struct
1172 {
1173 uint32_t r : x;
1174 uint32_t g : y;
1175 };
1176 struct
1177 {
1178 ///@ The following are here to provide full template needed in Formats.
1179 uint32_t b : x;
1180 uint32_t a : y;
1181 };
1182 };
1183
1184 //////////////////////////////////////////////////////////////////////////
1185 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
1186 //////////////////////////////////////////////////////////////////////////
1187 template<>
1188 union Format2<8,8>
1189 {
1190 struct
1191 {
1192 uint16_t r : 8;
1193 uint16_t g : 8;
1194 };
1195 struct
1196 {
1197 ///@ The following are here to provide full template needed in Formats.
1198 uint16_t b : 8;
1199 uint16_t a : 8;
1200 };
1201 };
1202
1203 //////////////////////////////////////////////////////////////////////////
1204 /// Format3 - Bitfield for 3 component formats.
1205 //////////////////////////////////////////////////////////////////////////
1206 template<uint32_t x, uint32_t y, uint32_t z>
1207 union Format3
1208 {
1209 struct
1210 {
1211 uint32_t r : x;
1212 uint32_t g : y;
1213 uint32_t b : z;
1214 };
1215 uint32_t a; ///@note This is here to provide full template needed in Formats.
1216 };
1217
1218 //////////////////////////////////////////////////////////////////////////
1219 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
1220 //////////////////////////////////////////////////////////////////////////
1221 template<>
1222 union Format3<5,6,5>
1223 {
1224 struct
1225 {
1226 uint16_t r : 5;
1227 uint16_t g : 6;
1228 uint16_t b : 5;
1229 };
1230 uint16_t a; ///@note This is here to provide full template needed in Formats.
1231 };
1232
1233 //////////////////////////////////////////////////////////////////////////
1234 /// Format4 - Bitfield for 4 component formats.
1235 //////////////////////////////////////////////////////////////////////////
1236 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1237 struct Format4
1238 {
1239 uint32_t r : x;
1240 uint32_t g : y;
1241 uint32_t b : z;
1242 uint32_t a : w;
1243 };
1244
1245 //////////////////////////////////////////////////////////////////////////
1246 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1247 //////////////////////////////////////////////////////////////////////////
1248 template<>
1249 struct Format4<5,5,5,1>
1250 {
1251 uint16_t r : 5;
1252 uint16_t g : 5;
1253 uint16_t b : 5;
1254 uint16_t a : 1;
1255 };
1256
1257 //////////////////////////////////////////////////////////////////////////
1258 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1259 //////////////////////////////////////////////////////////////////////////
1260 template<>
1261 struct Format4<4,4,4,4>
1262 {
1263 uint16_t r : 4;
1264 uint16_t g : 4;
1265 uint16_t b : 4;
1266 uint16_t a : 4;
1267 };
1268
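// ----------------------------------------------------------------------
// Illustrative sketch (not part of the driver): the FormatN bitfield
// helpers give named access to packed channels. Here a hypothetical
// 16-bit 5:6:5 pixel is decoded through Format3<5,6,5>; the union punning
// and bit ordering assume the little-endian compilers SWR targets.
static inline void Example_Decode565(uint16_t pixel, uint32_t &r, uint32_t &g, uint32_t &b)
{
    Format3<5, 6, 5> layout;
    layout.a = pixel;   // overlay the raw pixel onto the bitfield union

    r = layout.r;       // lowest 5 bits
    g = layout.g;       // middle 6 bits
    b = layout.b;       // highest 5 bits
}
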
1269 //////////////////////////////////////////////////////////////////////////
1270 /// ComponentTraits - Default components
1271 //////////////////////////////////////////////////////////////////////////
1272 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1273 struct Defaults
1274 {
1275 INLINE static uint32_t GetDefault(uint32_t comp)
1276 {
1277 static const uint32_t defaults[4]{ x, y, z, w };
1278 return defaults[comp];
1279 }
1280 };
1281
1282 //////////////////////////////////////////////////////////////////////////
1283 /// ComponentTraits - Component type traits.
1284 //////////////////////////////////////////////////////////////////////////
1285 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
1286 struct ComponentTraits
1287 {
1288 INLINE static SWR_TYPE GetType(uint32_t comp)
1289 {
1290 static const SWR_TYPE CompType[4]{ X, Y, Z, W };
1291 return CompType[comp];
1292 }
1293
1294 INLINE static uint32_t GetBPC(uint32_t comp)
1295 {
1296 static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
1297 return MyBpc[comp];
1298 }
1299
1300 INLINE static bool isNormalized(uint32_t comp)
1301 {
1302 switch (comp)
1303 {
1304 case 0:
1305 return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
1306 case 1:
1307 return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
1308 case 2:
1309 return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
1310 case 3:
1311 return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
1312 }
1313 SWR_INVALID("Invalid component: %d", comp);
1314 return false;
1315 }
1316
1317 INLINE static float toFloat(uint32_t comp)
1318 {
1319 switch (comp)
1320 {
1321 case 0:
1322 return TypeTraits<X, NumBitsX>::toFloat();
1323 case 1:
1324 return TypeTraits<Y, NumBitsY>::toFloat();
1325 case 2:
1326 return TypeTraits<Z, NumBitsZ>::toFloat();
1327 case 3:
1328 return TypeTraits<W, NumBitsW>::toFloat();
1329 }
1330 SWR_INVALID("Invalid component: %d", comp);
1331 return TypeTraits<X, NumBitsX>::toFloat();
1332
1333 }
1334
1335 INLINE static float fromFloat(uint32_t comp)
1336 {
1337 switch (comp)
1338 {
1339 case 0:
1340 return TypeTraits<X, NumBitsX>::fromFloat();
1341 case 1:
1342 return TypeTraits<Y, NumBitsY>::fromFloat();
1343 case 2:
1344 return TypeTraits<Z, NumBitsZ>::fromFloat();
1345 case 3:
1346 return TypeTraits<W, NumBitsW>::fromFloat();
1347 }
1348 SWR_INVALID("Invalid component: %d", comp);
1349 return TypeTraits<X, NumBitsX>::fromFloat();
1350 }
1351
1352 INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
1353 {
1354 switch (comp)
1355 {
1356 case 0:
1357 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1358 case 1:
1359 return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
1360 case 2:
1361 return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
1362 case 3:
1363 return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1364 }
1365 SWR_INVALID("Invalid component: %d", comp);
1366 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1367 }
1368
1369 INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
1370 {
1371 switch (comp)
1372 {
1373 case 0:
1374 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1375 return;
1376 case 1:
1377 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1378 return;
1379 case 2:
1380 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1381 return;
1382 case 3:
1383 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1384 return;
1385 }
1386 SWR_INVALID("Invalid component: %d", comp);
1387 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1388 }
1389
1390 INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1391 {
1392 switch (comp)
1393 {
1394 case 0:
1395 return TypeTraits<X, NumBitsX>::unpack(in);
1396 case 1:
1397 return TypeTraits<Y, NumBitsY>::unpack(in);
1398 case 2:
1399 return TypeTraits<Z, NumBitsZ>::unpack(in);
1400 case 3:
1401 return TypeTraits<W, NumBitsW>::unpack(in);
1402 }
1403 SWR_INVALID("Invalid component: %d", comp);
1404 return TypeTraits<X, NumBitsX>::unpack(in);
1405 }
1406
1407 INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1408 {
1409 switch (comp)
1410 {
1411 case 0:
1412 return TypeTraits<X, NumBitsX>::pack(in);
1413 case 1:
1414 return TypeTraits<Y, NumBitsY>::pack(in);
1415 case 2:
1416 return TypeTraits<Z, NumBitsZ>::pack(in);
1417 case 3:
1418 return TypeTraits<W, NumBitsW>::pack(in);
1419 }
1420 SWR_INVALID("Invalid component: %d", comp);
1421 return TypeTraits<X, NumBitsX>::pack(in);
1422 }
1423
1424 INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1425 {
1426 switch (comp)
1427 {
1428 case 0:
1429 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1430 case 1:
1431 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1432 case 2:
1433 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1434 case 3:
1435 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1436 }
1437 SWR_INVALID("Invalid component: %d", comp);
1438 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1439 }
1440 #if ENABLE_AVX512_SIMD16
1441
1442 INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
1443 {
1444 switch (comp)
1445 {
1446 case 0:
1447 return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1448 case 1:
1449 return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
1450 case 2:
1451 return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
1452 case 3:
1453 return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
1454 }
1455 SWR_INVALID("Invalid component: %d", comp);
1456 return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1457 }
1458
1459 INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
1460 {
1461 switch (comp)
1462 {
1463 case 0:
1464 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1465 return;
1466 case 1:
1467 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1468 return;
1469 case 2:
1470 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1471 return;
1472 case 3:
1473 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1474 return;
1475 }
1476 SWR_INVALID("Invalid component: %d", comp);
1477 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1478 }
1479
1480 INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
1481 {
1482 switch (comp)
1483 {
1484 case 0:
1485 return TypeTraits<X, NumBitsX>::unpack(in);
1486 case 1:
1487 return TypeTraits<Y, NumBitsY>::unpack(in);
1488 case 2:
1489 return TypeTraits<Z, NumBitsZ>::unpack(in);
1490 case 3:
1491 return TypeTraits<W, NumBitsW>::unpack(in);
1492 }
1493 SWR_INVALID("Invalid component: %d", comp);
1494 return TypeTraits<X, NumBitsX>::unpack(in);
1495 }
1496
1497 INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
1498 {
1499 switch (comp)
1500 {
1501 case 0:
1502 return TypeTraits<X, NumBitsX>::pack(in);
1503 case 1:
1504 return TypeTraits<Y, NumBitsY>::pack(in);
1505 case 2:
1506 return TypeTraits<Z, NumBitsZ>::pack(in);
1507 case 3:
1508 return TypeTraits<W, NumBitsW>::pack(in);
1509 }
1510 SWR_INVALID("Invalid component: %d", comp);
1511 return TypeTraits<X, NumBitsX>::pack(in);
1512 }
1513
1514 INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
1515 {
1516 switch (comp)
1517 {
1518 case 0:
1519 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1520 case 1:
1521 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1522 case 2:
1523 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1524 case 3:
1525 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1526 }
1527 SWR_INVALID("Invalid component: %d", comp);
1528 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1529 }
1530 #endif
1531 };
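
// ----------------------------------------------------------------------
// Illustrative sketch (not part of the driver): how ComponentTraits is
// typically parameterized and queried. The alias below describes a
// hypothetical 4 x 8-bit UNORM layout purely for demonstration; the real
// per-format trait structs are generated in the formats headers.
typedef ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8> ExampleRGBA8Traits;

static inline float Example_NormalizeRed(uint32_t rawRed)
{
    // per-component queries dispatch to the matching TypeTraits specialization
    bool  normalized = ExampleRGBA8Traits::isNormalized(0);  // true for UNORM
    float scale      = ExampleRGBA8Traits::toFloat(0);       // 1.0f / 255.0f for UNORM8

    return normalized ? float(rawRed) * scale : float(rawRed);
}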