swr/rast: Separate RDTSC code from archrast
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / format_types.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file format_types.h
24 *
25 * @brief Definitions for SWR_FORMAT functions.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "utils.h"
31 #include "common/simdintrin.h"
32
33 //////////////////////////////////////////////////////////////////////////
34 /// PackTraits - Helpers for packing / unpacking same pixel sizes
35 //////////////////////////////////////////////////////////////////////////
36 template <uint32_t NumBits, bool Signed = false>
37 struct PackTraits
38 {
39 static const uint32_t MyNumBits = NumBits;
40 static simdscalar loadSOA(const uint8_t *pSrc) = delete;
41 static void storeSOA(uint8_t *pDst, simdscalar const &src) = delete;
42 static simdscalar unpack(simdscalar &in) = delete;
43 static simdscalar pack(simdscalar &in) = delete;
44 #if ENABLE_AVX512_SIMD16
45 static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
46 static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) = delete;
47 static simd16scalar unpack(simd16scalar &in) = delete;
48 static simd16scalar pack(simd16scalar &in) = delete;
49 #endif
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// PackTraits - Helpers for packing / unpacking unused channels
54 //////////////////////////////////////////////////////////////////////////
55 template <>
56 struct PackTraits<0, false>
57 {
58 static const uint32_t MyNumBits = 0;
59
60 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
61 static void storeSOA(uint8_t *pDst, simdscalar const &src) { return; }
62 static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
63 static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
64 #if ENABLE_AVX512_SIMD16
65 static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
66 static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) { return; }
67 static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
68 static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
69 #endif
70 };
71
72 //////////////////////////////////////////////////////////////////////////
73 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
74 //////////////////////////////////////////////////////////////////////////
75 template <>
76 struct PackTraits<8, false>
77 {
78 static const uint32_t MyNumBits = 8;
79
80 static simdscalar loadSOA(const uint8_t *pSrc)
81 {
82 #if KNOB_SIMD_WIDTH == 8
83 __m256 result = _mm256_setzero_ps();
84 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
85 return _mm256_insertf128_ps(result, vLo, 0);
86 #else
87 #error Unsupported vector width
88 #endif
89 }
90
91 static void storeSOA(uint8_t *pDst, simdscalar const &src)
92 {
93 // store simd bytes
94 #if KNOB_SIMD_WIDTH == 8
95 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
96 #else
97 #error Unsupported vector width
98 #endif
99 }
100
101 static simdscalar unpack(simdscalar &in)
102 {
103 #if KNOB_SIMD_WIDTH == 8
104 #if KNOB_ARCH <= KNOB_ARCH_AVX
105 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
106 __m128i resLo = _mm_cvtepu8_epi32(src);
107 __m128i resHi = _mm_shuffle_epi8(src,
108 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
109
110 __m256i result = _mm256_castsi128_si256(resLo);
111 result = _mm256_insertf128_si256(result, resHi, 1);
112 return simdscalar{ _mm256_castsi256_ps(result) };
113 #else
114 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
115 #endif
116 #else
117 #error Unsupported vector width
118 #endif
119 }
120
121 static simdscalar pack(simdscalar &in)
122 {
123 #if KNOB_SIMD_WIDTH == 8
124 simdscalari src = _simd_castps_si(in);
125 __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
126 __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
127 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
128 #else
129 #error Unsupported vector width
130 #endif
131 }
132 #if ENABLE_AVX512_SIMD16
133
134 static simd16scalar loadSOA_16(const uint8_t *pSrc)
135 {
136 simd16scalar result = _simd16_setzero_ps();
137 simdscalar resultlo = _simd_setzero_ps();
138
139 const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
140
141 resultlo = _mm256_insertf128_ps(resultlo, src, 0);
142 result = _simd16_insert_ps(result, resultlo, 0);
143
144 return result;
145 }
146
147 static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
148 {
149 // store simd16 bytes
150 _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
151 }
152
153 static simd16scalar unpack(simd16scalar &in)
154 {
155 simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
156 simd16scalari result = _simd16_cvtepu8_epi32(tmp);
157
158 return _simd16_castsi_ps(result);
159 }
160
161 static simd16scalar pack(simd16scalar &in)
162 {
163 simd16scalari result = _simd16_setzero_si();
164
165 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
166 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
167
168 simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
169 simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
170
171 simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
172
173 const simdscalari zero = _simd_setzero_si();
174
175 permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
176 permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
177
178 pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
179
180 result = _simd16_insert_si(result, pack, 0);
181
182 return _simd16_castsi_ps(result);
183 }
184 #endif
185 };
186
187 //////////////////////////////////////////////////////////////////////////
188 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
189 //////////////////////////////////////////////////////////////////////////
190 template <>
191 struct PackTraits<8, true>
192 {
193 static const uint32_t MyNumBits = 8;
194
195 static simdscalar loadSOA(const uint8_t *pSrc)
196 {
197 #if KNOB_SIMD_WIDTH == 8
198 __m256 result = _mm256_setzero_ps();
199 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
200 return _mm256_insertf128_ps(result, vLo, 0);
201 #else
202 #error Unsupported vector width
203 #endif
204 }
205
206 static void storeSOA(uint8_t *pDst, simdscalar const &src)
207 {
208 // store simd bytes
209 #if KNOB_SIMD_WIDTH == 8
210 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
211 #else
212 #error Unsupported vector width
213 #endif
214 }
215
216 static simdscalar unpack(simdscalar &in)
217 {
218 #if KNOB_SIMD_WIDTH == 8
219 #if KNOB_ARCH <= KNOB_ARCH_AVX
220 SWR_INVALID("I think this may be incorrect.");
221 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
222 __m128i resLo = _mm_cvtepi8_epi32(src);
223 __m128i resHi = _mm_shuffle_epi8(src,
224 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
225
226 __m256i result = _mm256_castsi128_si256(resLo);
227 result = _mm256_insertf128_si256(result, resHi, 1);
228 return _mm256_castsi256_ps(result);
229 #else
230 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
231 #endif
232 #else
233 #error Unsupported vector width
234 #endif
235 }
236
237 static simdscalar pack(simdscalar &in)
238 {
239 #if KNOB_SIMD_WIDTH == 8
240 simdscalari src = _simd_castps_si(in);
241 __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
242 __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
243 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
244 #else
245 #error Unsupported vector width
246 #endif
247 }
248 #if ENABLE_AVX512_SIMD16
249
250 static simd16scalar loadSOA_16(const uint8_t *pSrc)
251 {
252 simd16scalar result = _simd16_setzero_ps();
253 simdscalar resultlo = _simd_setzero_ps();
254
255 const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
256
257 resultlo = _mm256_insertf128_ps(resultlo, src, 0);
258 result = _simd16_insert_ps(result, resultlo, 0);
259
260 return result;
261 }
262
263 static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
264 {
265 // store simd16 bytes
266 _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
267 }
268
269 static simd16scalar unpack(simd16scalar &in)
270 {
271 simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
272 simd16scalari result = _simd16_cvtepu8_epi32(tmp);
273
274 return _simd16_castsi_ps(result);
275 }
276
277 static simd16scalar pack(simd16scalar &in)
278 {
279 simd16scalari result = _simd16_setzero_si();
280
281 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
282 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
283
284 simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
285 simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
286
287 simdscalari pack = _simd_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
288
289 const simdscalari zero = _simd_setzero_si();
290
291 permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
292 permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
293
294 pack = _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
295
296 result = _simd16_insert_si(result, pack, 0);
297
298 return _simd16_castsi_ps(result);
299 }
300 #endif
301 };
302
303 //////////////////////////////////////////////////////////////////////////
304 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
305 //////////////////////////////////////////////////////////////////////////
306 template <>
307 struct PackTraits<16, false>
308 {
309 static const uint32_t MyNumBits = 16;
310
311 static simdscalar loadSOA(const uint8_t *pSrc)
312 {
313 #if KNOB_SIMD_WIDTH == 8
314 __m256 result = _mm256_setzero_ps();
315 __m128 vLo = _mm_load_ps((const float*)pSrc);
316 return _mm256_insertf128_ps(result, vLo, 0);
317 #else
318 #error Unsupported vector width
319 #endif
320 }
321
322 static void storeSOA(uint8_t *pDst, simdscalar const &src)
323 {
324 #if KNOB_SIMD_WIDTH == 8
325 // store 16B (2B * 8)
326 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
327 #else
328 #error Unsupported vector width
329 #endif
330 }
331
332 static simdscalar unpack(simdscalar &in)
333 {
334 #if KNOB_SIMD_WIDTH == 8
335 #if KNOB_ARCH <= KNOB_ARCH_AVX
336 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
337 __m128i resLo = _mm_cvtepu16_epi32(src);
338 __m128i resHi = _mm_shuffle_epi8(src,
339 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
340
341 __m256i result = _mm256_castsi128_si256(resLo);
342 result = _mm256_insertf128_si256(result, resHi, 1);
343 return _mm256_castsi256_ps(result);
344 #else
345 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
346 #endif
347 #else
348 #error Unsupported vector width
349 #endif
350 }
351
352 static simdscalar pack(simdscalar &in)
353 {
354 #if KNOB_SIMD_WIDTH == 8
355 simdscalari src = _simd_castps_si(in);
356 __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
357 return _mm256_castsi256_ps(res);
358 #else
359 #error Unsupported vector width
360 #endif
361 }
362 #if ENABLE_AVX512_SIMD16
363
364 static simd16scalar loadSOA_16(const uint8_t *pSrc)
365 {
366 simd16scalar result = _simd16_setzero_ps();
367
368 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
369
370 result = _simd16_insert_ps(result, resultlo, 0);
371
372 return result;
373 }
374
375 static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
376 {
377 _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
378 }
379
380 static simd16scalar unpack(simd16scalar &in)
381 {
382 simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
383
384 return _simd16_castsi_ps(result);
385 }
386
387 static simd16scalar pack(simd16scalar &in)
388 {
389 const simd16scalari zero = _simd16_setzero_si();
390
391 simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
392 simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
393
394 simd16scalari result = _simd16_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
395
396 return _simd16_castsi_ps(result);
397 }
398 #endif
399 };
400
401 //////////////////////////////////////////////////////////////////////////
402 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
403 //////////////////////////////////////////////////////////////////////////
404 template <>
405 struct PackTraits<16, true>
406 {
407 static const uint32_t MyNumBits = 16;
408
409 static simdscalar loadSOA(const uint8_t *pSrc)
410 {
411 #if KNOB_SIMD_WIDTH == 8
412 __m256 result = _mm256_setzero_ps();
413 __m128 vLo = _mm_load_ps((const float*)pSrc);
414 return _mm256_insertf128_ps(result, vLo, 0);
415 #else
416 #error Unsupported vector width
417 #endif
418 }
419
420 static void storeSOA(uint8_t *pDst, simdscalar const &src)
421 {
422 #if KNOB_SIMD_WIDTH == 8
423 // store 16B (2B * 8)
424 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
425 #else
426 #error Unsupported vector width
427 #endif
428 }
429
430 static simdscalar unpack(simdscalar &in)
431 {
432 #if KNOB_SIMD_WIDTH == 8
433 #if KNOB_ARCH <= KNOB_ARCH_AVX
434 SWR_INVALID("I think this may be incorrect.");
435 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
436 __m128i resLo = _mm_cvtepi16_epi32(src);
437 __m128i resHi = _mm_shuffle_epi8(src,
438 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
439
440 __m256i result = _mm256_castsi128_si256(resLo);
441 result = _mm256_insertf128_si256(result, resHi, 1);
442 return _mm256_castsi256_ps(result);
443 #else
444 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
445 #endif
446 #else
447 #error Unsupported vector width
448 #endif
449 }
450
451 static simdscalar pack(simdscalar &in)
452 {
453 #if KNOB_SIMD_WIDTH == 8
454 simdscalari src = _simd_castps_si(in);
455 __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
456 return _mm256_castsi256_ps(res);
457 #else
458 #error Unsupported vector width
459 #endif
460 }
461 #if ENABLE_AVX512_SIMD16
462
463 static simd16scalar loadSOA_16(const uint8_t *pSrc)
464 {
465 simd16scalar result = _simd16_setzero_ps();
466
467 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
468
469 result = _simd16_insert_ps(result, resultlo, 0);
470
471 return result;
472 }
473
474 static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
475 {
476 _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
477 }
478
479 static simd16scalar unpack(simd16scalar &in)
480 {
481 simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
482
483 return _simd16_castsi_ps(result);
484 }
485
486 static simd16scalar pack(simd16scalar &in)
487 {
488 const simd16scalari zero = _simd16_setzero_si();
489
490 simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
491 simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
492
493 simd16scalari result = _simd16_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
494
495 return _simd16_castsi_ps(result);
496 }
497 #endif
498 };
499
500 //////////////////////////////////////////////////////////////////////////
501 /// PackTraits - Helpers for packing / unpacking 32 bit channels
502 //////////////////////////////////////////////////////////////////////////
503 template <>
504 struct PackTraits<32, false>
505 {
506 static const uint32_t MyNumBits = 32;
507
508 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
509 static void storeSOA(uint8_t *pDst, simdscalar const &src) { _simd_store_ps((float*)pDst, src); }
510 static simdscalar unpack(simdscalar &in) { return in; }
511 static simdscalar pack(simdscalar &in) { return in; }
512 #if ENABLE_AVX512_SIMD16
513
514 static simd16scalar loadSOA_16(const uint8_t *pSrc)
515 {
516 return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
517 }
518
519 static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
520 {
521 _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
522 }
523
524 static simd16scalar unpack(simd16scalar &in)
525 {
526 return in;
527 }
528
529 static simd16scalar pack(simd16scalar &in)
530 {
531 return in;
532 }
533 #endif
534 };
535
536 //////////////////////////////////////////////////////////////////////////
537 /// TypeTraits - Format type traits.
538 //////////////////////////////////////////////////////////////////////////
539 template<SWR_TYPE type, uint32_t NumBits>
540 struct TypeTraits : PackTraits<NumBits>
541 {
542 static const SWR_TYPE MyType = type;
543 static float toFloat() { return 0.0; }
544 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
545 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
546 };
547
548 //////////////////////////////////////////////////////////////////////////
549 /// TypeTraits - Format type traits specialization for UINT8
550 //////////////////////////////////////////////////////////////////////////
551 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
552 {
553 static const SWR_TYPE MyType = SWR_TYPE_UINT;
554 static float toFloat() { return 0.0; }
555 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
556 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
557 };
558
559 //////////////////////////////////////////////////////////////////////////
560 /// TypeTraits - Format type traits specialization for UINT8
561 //////////////////////////////////////////////////////////////////////////
562 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
563 {
564 static const SWR_TYPE MyType = SWR_TYPE_SINT;
565 static float toFloat() { return 0.0; }
566 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
567 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
568 };
569
570 //////////////////////////////////////////////////////////////////////////
571 /// TypeTraits - Format type traits specialization for UINT16
572 //////////////////////////////////////////////////////////////////////////
573 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
574 {
575 static const SWR_TYPE MyType = SWR_TYPE_UINT;
576 static float toFloat() { return 0.0; }
577 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
578 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
579 };
580
581 //////////////////////////////////////////////////////////////////////////
582 /// TypeTraits - Format type traits specialization for SINT16
583 //////////////////////////////////////////////////////////////////////////
584 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
585 {
586 static const SWR_TYPE MyType = SWR_TYPE_SINT;
587 static float toFloat() { return 0.0; }
588 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
589 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
590 };
591
592 //////////////////////////////////////////////////////////////////////////
593 /// TypeTraits - Format type traits specialization for UINT32
594 //////////////////////////////////////////////////////////////////////////
595 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
596 {
597 static const SWR_TYPE MyType = SWR_TYPE_UINT;
598 static float toFloat() { return 0.0; }
599 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
600 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
601 };
602
603 //////////////////////////////////////////////////////////////////////////
604 /// TypeTraits - Format type traits specialization for UINT32
605 //////////////////////////////////////////////////////////////////////////
606 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
607 {
608 static const SWR_TYPE MyType = SWR_TYPE_SINT;
609 static float toFloat() { return 0.0; }
610 static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
611 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
612 };
613
614 //////////////////////////////////////////////////////////////////////////
615 /// TypeTraits - Format type traits specialization for UNORM5
616 //////////////////////////////////////////////////////////////////////////
617 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
618 {
619 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
620 static float toFloat() { return 1.0f / 31.0f; }
621 static float fromFloat() { return 31.0f; }
622 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
623 };
624
625 //////////////////////////////////////////////////////////////////////////
626 /// TypeTraits - Format type traits specialization for UNORM6
627 //////////////////////////////////////////////////////////////////////////
628 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
629 {
630 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
631 static float toFloat() { return 1.0f / 63.0f; }
632 static float fromFloat() { return 63.0f; }
633 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
634 };
635
636 //////////////////////////////////////////////////////////////////////////
637 /// TypeTraits - Format type traits specialization for UNORM8
638 //////////////////////////////////////////////////////////////////////////
639 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
640 {
641 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
642 static float toFloat() { return 1.0f / 255.0f; }
643 static float fromFloat() { return 255.0f; }
644 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
645 };
646
647 //////////////////////////////////////////////////////////////////////////
648 /// TypeTraits - Format type traits specialization for UNORM8
649 //////////////////////////////////////////////////////////////////////////
650 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
651 {
652 static const SWR_TYPE MyType = SWR_TYPE_SNORM;
653 static float toFloat() { return 1.0f / 127.0f; }
654 static float fromFloat() { return 127.0f; }
655 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
656 };
657
658 //////////////////////////////////////////////////////////////////////////
659 /// TypeTraits - Format type traits specialization for UNORM16
660 //////////////////////////////////////////////////////////////////////////
661 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
662 {
663 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
664 static float toFloat() { return 1.0f / 65535.0f; }
665 static float fromFloat() { return 65535.0f; }
666 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
667 };
668
669 //////////////////////////////////////////////////////////////////////////
670 /// TypeTraits - Format type traits specialization for SNORM16
671 //////////////////////////////////////////////////////////////////////////
672 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
673 {
674 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
675 static float toFloat() { return 1.0f / 32767.0f; }
676 static float fromFloat() { return 32767.0f; }
677 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
678 };
679
680 //////////////////////////////////////////////////////////////////////////
681 /// TypeTraits - Format type traits specialization for UNORM24
682 //////////////////////////////////////////////////////////////////////////
683 template<>
684 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
685 {
686 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
687 static float toFloat() { return 1.0f / 16777215.0f; }
688 static float fromFloat() { return 16777215.0f; }
689 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
690 };
691
692 //////////////////////////////////////////////////////////////////////////
693 // FLOAT Specializations from here on...
694 //////////////////////////////////////////////////////////////////////////
695 #define TO_M128i(a) _mm_castps_si128(a)
696 #define TO_M128(a) _mm_castsi128_ps(a)
697
698 #include "math.h"
699
/// Fast approximate power of each of the four floats in arg, using the
/// float bit pattern as a stand-in for the logarithm.
///
/// The exponent applied is expnum / expden.  coeffnum / coeffden feeds a
/// constant pre-correction factor that is computed once per template
/// instantiation (function-local static).
template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
inline static __m128 fastpow(__m128 arg)
{
    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
                                             * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));

    // Apply the constant pre-correction factor.
    const __m128 corrected = _mm_mul_ps(arg, factor);

    // Convert the raw bit pattern (read as int32) to float: this value
    // serves as the logarithm.
    const __m128 logValue = _mm_cvtepi32_ps(_mm_castps_si128(corrected));

    // Multiply the logarithm by the power.
    const __m128 scaledLog = _mm_mul_ps(logValue, _mm_set1_ps(1.0f * expnum / expden));

    // Round back to int32 and reinterpret those bits as float to
    // exponentiate.
    return _mm_castsi128_ps(_mm_cvtps_epi32(scaledLog));
}
723
724 inline static __m128 pow512_4(__m128 arg) {
725 // 5/12 is too small, so compute the 4th root of 20/12 instead.
726 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
727 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
728 __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
729 __m128 xover = _mm_mul_ps(arg, xf);
730
731 __m128 xfm1 = _mm_rsqrt_ps(xf);
732 __m128 x2 = _mm_mul_ps(arg, arg);
733 __m128 xunder = _mm_mul_ps(x2, xfm1);
734
735 // sqrt2 * over + 2 * sqrt2 * under
736 __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
737 _mm_add_ps(xover, xunder));
738
739 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
740 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
741 return xavg;
742 }
743
/// Applies powf(x, Exp) to each of the four floats in Base and returns
/// the results in the same lane order.
inline static __m128 powf_wrapper(__m128 Base, float Exp)
{
    float lanes[4];
    _mm_storeu_ps(lanes, Base);

    for (int i = 0; i < 4; ++i)
    {
        lanes[i] = powf(lanes[i], Exp);
    }

    // _mm_set_ps takes its arguments from the highest lane to the lowest.
    return _mm_set_ps(lanes[3], lanes[2], lanes[1], lanes[0]);
}
753
754 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
755 {
756 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
757 __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
758
759 // squeeze the mask down to 16 bits (4 bits per DWORD)
760 int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
761
762 __m128 Result;
763
764 //
765 if (CompareResult == 0xFFFF)
766 {
767 // all DWORDs are <= the threshold
768 Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
769 }
770 else if (CompareResult == 0x0)
771 {
772 // all DWORDs are > the threshold
773 __m128 fSrc_0RGB = Src;
774
775 // --> 1.055f * c(1.0f/2.4f) - 0.055f
776 #if KNOB_USE_FAST_SRGB == TRUE
777 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
778 __m128 f = pow512_4(fSrc_0RGB);
779 #else
780 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
781 #endif
782 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
783 Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
784 }
785 else
786 {
787 // some DWORDs are <= the threshold and some are > threshold
788 __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
789
790 __m128 fSrc_0RGB = Src;
791
792 // --> 1.055f * c(1.0f/2.4f) - 0.055f
793 #if KNOB_USE_FAST_SRGB == TRUE
794 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
795 __m128 f = pow512_4(fSrc_0RGB);
796 #else
797 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
798 #endif
799 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
800 f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
801
802 // Clear the alpha (is garbage after the sub)
803 __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
804
805 __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
806 __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
807 __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
808
809 Result = TO_M128(CombinedParts);
810 }
811
812 return Result;
813 }
814
815 #if ENABLE_AVX512_SIMD16
816 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
817 inline static simd16scalar SIMDCALL fastpow(simd16scalar const &value)
818 {
819 static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
820 * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
821
822 // Apply a constant pre-correction factor.
823 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
824
825 // Reinterpret arg as integer to obtain logarithm.
826 //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
827 result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
828
829 // Multiply logarithm by power.
830 result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
831
832 // Convert back to "integer" to exponentiate.
833 //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
834 result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
835
836 return result;
837 }
838
839 inline static simd16scalar SIMDCALL pow512_4(simd16scalar const &arg)
840 {
841 // 5/12 is too small, so compute the 4th root of 20/12 instead.
842 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
843 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
844 simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
845 simd16scalar xover = _simd16_mul_ps(arg, xf);
846
847 simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
848 simd16scalar x2 = _simd16_mul_ps(arg, arg);
849 simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
850
851 // sqrt2 * over + 2 * sqrt2 * under
852 simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));
853
854 xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
855 xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
856
857 return xavg;
858 }
859
860 inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar &base, float exp)
861 {
862 const float *f = reinterpret_cast<const float *>(&base);
863
864 return _simd16_set_ps(
865 powf(f[15], exp),
866 powf(f[14], exp),
867 powf(f[13], exp),
868 powf(f[12], exp),
869 powf(f[11], exp),
870 powf(f[10], exp),
871 powf(f[ 9], exp),
872 powf(f[ 8], exp),
873 powf(f[ 7], exp),
874 powf(f[ 6], exp),
875 powf(f[ 5], exp),
876 powf(f[ 4], exp),
877 powf(f[ 3], exp),
878 powf(f[ 2], exp),
879 powf(f[ 1], exp),
880 powf(f[ 0], exp)
881 );
882 }
883
884 // float to SRGB conversion formula
885 //
886 // if (value < 0.0031308f)
887 // value *= 12.92f;
888 // else
889 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
890 //
891 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
892 {
893 // create a mask where the source is < the minimal SRGB float value
894 const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
895
896 // if all elements are < the threshold, result = value * 12.92
897 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
898
899 if (_simd16_mask2int(mask) != 0xFFFF)
900 {
901 // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
902 #if KNOB_USE_FAST_SRGB == TRUE
903 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
904 simd16scalar result2 = pow512_4(value);
905 #else
906 simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
907 #endif
908
909 result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
910 result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
911
912 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
913 // only native AVX512 can directly use the computed mask for the blend operation
914 result = _mm512_mask_blend_ps(mask, result2, result);
915 #else
916 result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
917 #endif
918 }
919
920 return result;
921 }
922
923 #endif
924 //////////////////////////////////////////////////////////////////////////
925 /// TypeTraits - Format type traits specialization for FLOAT16
// Provides pack() from 32-bit float lanes to 16-bit half-float lanes for the
// 8-wide and (when ENABLE_AVX512_SIMD16 is set) 16-wide SIMD types.
// unpack() and convertSrgb() are not implemented: they report SWR_NOT_IMPL
// and return a zero vector.
926 //////////////////////////////////////////////////////////////////////////
927 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
928 {
929 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
// Both scale factors are 1.0f for this type.
930 static float toFloat() { return 1.0f; }
931 static float fromFloat() { return 1.0f; }
// Not implemented for FLOAT16; reports SWR_NOT_IMPL and returns zeros.
932 static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
933
// Converts 8 packed float32 values to 8 packed float16 values.
// On KNOB_ARCH_AVX builds the conversion is done by hand with integer
// operations; on other architectures _mm256_cvtps_ph is used with
// _MM_FROUND_TRUNC.  Any KNOB_SIMD_WIDTH other than 8 is a compile error.
934 static simdscalar pack(const simdscalar &in)
935 {
936 #if KNOB_SIMD_WIDTH == 8
937 #if (KNOB_ARCH == KNOB_ARCH_AVX)
938 // input is 8 packed float32, output is 8 packed float16
939 simdscalari src = _simd_castps_si(in);
940
// float32 bit-layout constants
941 static const uint32_t FLOAT_EXP_BITS = 8;
942 static const uint32_t FLOAT_MANTISSA_BITS = 23;
943 static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
944 static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
945
// float16 bit-layout constants
946 static const uint32_t HALF_EXP_BITS = 5;
947 static const uint32_t HALF_MANTISSA_BITS = 10;
948 static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
949
950 // minimum exponent required, exponents below this are flushed to 0.
951 static const int32_t HALF_EXP_MIN = -14;
952 static const int32_t FLOAT_EXP_BIAS = 127;
953 static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
954 static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
955
956 // maximum exponent required, exponents above this are set to infinity
957 static const int32_t HALF_EXP_MAX = 15;
958 static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
959
// Broadcast masks and exponent limits; the limits are biased float32
// exponents already shifted into the exponent field position.
960 const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
961 const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK);
962 const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
963 const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
964 const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
965 const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
966
// Split every lane into its sign, exponent and mantissa fields (in place).
967 simdscalari vSign = _simd_and_si(src, vSignMask);
968 simdscalari vExp = _simd_and_si(src, vExpMask);
969 simdscalari vMan = _simd_and_si(src, vManMask);
970
// Classify each lane:
//   vFTZMask    - exponent below FLOAT_EXP_MIN_FTZ, result is flushed to zero
//   vDenormMask - exponent below FLOAT_EXP_MIN but not flushed, handled by the scalar loop below
//   vInfMask    - exponent field all ones (Inf / NaN)
//   vClampMask  - exponent above FLOAT_EXP_MAX and not Inf / NaN, result is replaced by 0x7BFF
971 simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz);
972 simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
973 simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp);
974 simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
975
// Re-bias the exponent: an exponent equal to FLOAT_EXP_MIN becomes exponent value 1.
976 simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
977
978 // pack output 16-bits into the lower 16-bits of each 32-bit channel
// The shift by 13 moves the exponent from bit 23 to bit 10; the mantissa is
// shifted by the same amount, dropping its low bits without rounding.
979 simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
980 vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
981
982 // Flush To Zero
983 vDst = _simd_andnot_si(vFTZMask, vDst);
984 // Apply Infinites / NaN
985 vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
986
987 // Apply clamps
// 0x7BFF is exponent field 0x1E with all ten mantissa bits set.
988 vDst = _simd_andnot_si(vClampMask, vDst);
989 vDst = _simd_or_si(vDst,
990 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
991
992 // Compute Denormals (subnormals)
// Only entered when at least one lane is flagged in vDenormMask; the flagged
// lanes are then recomputed one at a time through uint32_t views of the vectors.
993 if (!_mm256_testz_si256(vDenormMask, vDenormMask))
994 {
995 uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
996 uint32_t *pExp = (uint32_t*)&vExp;
997 uint32_t *pMan = (uint32_t*)&vMan;
998 uint32_t *pDst = (uint32_t*)&vDst;
999 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1000 {
1001 if (pDenormMask[i])
1002 {
1003 // Need to compute subnormal value
1004 uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
1005 uint32_t mantissa = pMan[i] |
1006 (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit
1007
// Shift by the usual 13 bits plus one extra bit for every exponent step below FLOAT_EXP_MIN.
1008 pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1009 }
1010 }
1011 }
1012
1013 // Add in sign bits
// The sign moves from bit 31 to bit 15.
1014 vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
1015
1016 // Pack to lower 128-bits
// The low and high 128-bit halves are narrowed from 32-bit to 16-bit lanes and
// concatenated; only the lower 128 bits of vDst carry results afterwards.
1017 vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
1018
// Disabled debug cross-check against _mm256_cvtps_ph (compiled out by "#if 0").
1019 #if 0
1020 #if !defined(NDEBUG)
1021 simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
1022
1023 for (uint32_t i = 0; i < 4; ++i)
1024 {
1025 SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
1026 }
1027 #endif
1028 #endif
1029
1030 return _simd_castsi_ps(vDst);
1031
1032 #else
// Non-KNOB_ARCH_AVX builds: single conversion intrinsic, truncating.
1033 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
1034 #endif
1035 #else
1036 #error Unsupported vector width
1037 #endif
1038 }
1039
// Not implemented; reports SWR_NOT_IMPL and returns zeros.
1040 static simdscalar unpack(const simdscalar &in)
1041 {
1042 // input is 8 packed float16, output is 8 packed float32
1043 SWR_NOT_IMPL; // @todo
1044 return _simd_setzero_ps();
1045 }
1046 #if ENABLE_AVX512_SIMD16
1047
// Converts 16 packed float32 values to 16 packed float16 values.
// Each 8-lane half of the input is converted to a 128-bit result (via the
// 8-wide pack() on KNOB_ARCH_AVX, via _mm256_cvtps_ph otherwise); the two
// 128-bit results are placed side by side and inserted at index 0 of a
// zero-initialised 16-wide vector.
1048 static simd16scalar pack(const simd16scalar &in)
1049 {
1050 simd16scalari result = _simd16_setzero_si();
1051 simdscalari resultlo = _simd_setzero_si();
1052
1053 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1054 simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
1055 simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
1056
// The 8-wide pack() leaves its results in the lower 128 bits.
1057 __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
1058 __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);
1059
1060 #else
1061 __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
1062 __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
1063
1064 #endif
1065 resultlo = _simd_insertf128_si(resultlo, templo, 0);
1066 resultlo = _simd_insertf128_si(resultlo, temphi, 1);
1067
1068 result = _simd16_insert_si(result, resultlo, 0);
1069
1070 return _simd16_castsi_ps(result);
1071 }
1072
// Not implemented; reports SWR_NOT_IMPL and returns zeros.
1073 static simd16scalar unpack(const simd16scalar &in)
1074 {
1075 // input is 16 packed float16, output is 16 packed float32
1076 SWR_NOT_IMPL; // @todo
1077 return _simd16_setzero_ps();
1078 }
1079 #endif
1080 };
1081
1082 //////////////////////////////////////////////////////////////////////////
1083 /// TypeTraits - Format type traits specialization for FLOAT32
1084 //////////////////////////////////////////////////////////////////////////
1085 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
1086 {
1087 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
1088 static float toFloat() { return 1.0f; }
1089 static float fromFloat() { return 1.0f; }
1090 static inline simdscalar convertSrgb(simdscalar &in)
1091 {
1092 #if KNOB_SIMD_WIDTH == 8
1093 __m128 srcLo = _mm256_extractf128_ps(in, 0);
1094 __m128 srcHi = _mm256_extractf128_ps(in, 1);
1095
1096 srcLo = ConvertFloatToSRGB2(srcLo);
1097 srcHi = ConvertFloatToSRGB2(srcHi);
1098
1099 in = _mm256_insertf128_ps(in, srcLo, 0);
1100 in = _mm256_insertf128_ps(in, srcHi, 1);
1101 #else
1102 #error Unsupported vector width
1103 #endif
1104 return in;
1105 }
1106 #if ENABLE_AVX512_SIMD16
1107
1108 static inline simd16scalar convertSrgb(simd16scalar &in)
1109 {
1110 return ConvertFloatToSRGB2(in);
1111 }
1112 #endif
1113 };
1114
//////////////////////////////////////////////////////////////////////////
/// FormatIntType - Selects the base integer type for pixel components
///                 from the total number of bits.  Components can be
///                 smaller than this type, but the entire pixel must not
///                 be any smaller than this type.
///
///                 bits <=  8            -> uint8_t
///                 bits <= 16 (and > 8)  -> uint16_t
///                 anything else         -> uint32_t
//////////////////////////////////////////////////////////////////////////
template <uint32_t bits, bool bits8 = (bits <= 8), bool bits16 = (bits <= 16)>
struct FormatIntType
{
    using TYPE = uint32_t;
};

/// Fits in 8 bits (which implies it also fits in 16).
template <uint32_t bits>
struct FormatIntType<bits, true, true>
{
    using TYPE = uint8_t;
};

/// Fits in 16 bits but not in 8.
template <uint32_t bits>
struct FormatIntType<bits, false, true>
{
    using TYPE = uint16_t;
};
1138
1139 //////////////////////////////////////////////////////////////////////////
1140 /// Format1 - Bitfield for single component formats.
1141 //////////////////////////////////////////////////////////////////////////
1142 template<uint32_t x>
1143 union Format1
1144 {
1145 typedef typename FormatIntType<x>::TYPE TYPE;
1146 struct
1147 {
1148 TYPE r : x;
1149 };
1150
1151 ///@ The following are here to provide full template needed in Formats.
1152 struct
1153 {
1154 TYPE g : x;
1155 };
1156 struct
1157 {
1158 TYPE b : x;
1159 };
1160 struct
1161 {
1162 TYPE a : x;
1163 };
1164 };
1165
1166 //////////////////////////////////////////////////////////////////////////
1167 /// Format2 - Bitfield for 2 component formats.
1168 //////////////////////////////////////////////////////////////////////////
1169 template<uint32_t x, uint32_t y>
1170 union Format2
1171 {
1172 typedef typename FormatIntType<x + y>::TYPE TYPE;
1173
1174 struct
1175 {
1176 TYPE r : x;
1177 TYPE g : y;
1178 };
1179 struct
1180 {
1181 ///@ The following are here to provide full template needed in Formats.
1182 TYPE b : x;
1183 TYPE a : y;
1184 };
1185 };
1186
1187 //////////////////////////////////////////////////////////////////////////
1188 /// Format3 - Bitfield for 3 component formats.
1189 //////////////////////////////////////////////////////////////////////////
1190 template<uint32_t x, uint32_t y, uint32_t z>
1191 union Format3
1192 {
1193 typedef typename FormatIntType<x + y + z>::TYPE TYPE;
1194
1195 struct
1196 {
1197 TYPE r : x;
1198 TYPE g : y;
1199 TYPE b : z;
1200 };
1201 TYPE a; ///@note This is here to provide full template needed in Formats.
1202 };
1203
1204 //////////////////////////////////////////////////////////////////////////
1205 /// Format4 - Bitfield for 4 component formats.
1206 //////////////////////////////////////////////////////////////////////////
1207 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1208 struct Format4
1209 {
1210 typedef typename FormatIntType<x + y + z + w>::TYPE TYPE;
1211
1212 TYPE r : x;
1213 TYPE g : y;
1214 TYPE b : z;
1215 TYPE a : w;
1216 };
1217
1218 //////////////////////////////////////////////////////////////////////////
1219 /// ComponentTraits - Default components
1220 //////////////////////////////////////////////////////////////////////////
1221 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1222 struct Defaults
1223 {
1224 INLINE static uint32_t GetDefault(uint32_t comp)
1225 {
1226 static const uint32_t defaults[4]{ x, y, z, w };
1227 return defaults[comp];
1228 }
1229 };
1230
1231 //////////////////////////////////////////////////////////////////////////
1232 /// ComponentTraits - Component type traits.
1233 //////////////////////////////////////////////////////////////////////////
1234 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
1235 struct ComponentTraits
1236 {
1237 INLINE static SWR_TYPE GetType(uint32_t comp)
1238 {
1239 static const SWR_TYPE CompType[4]{ X, Y, Z, W };
1240 return CompType[comp];
1241 }
1242
1243 INLINE static constexpr uint32_t GetConstBPC(uint32_t comp)
1244 {
1245 return (comp == 3) ? NumBitsW :
1246 ((comp == 2) ? NumBitsZ :
1247 ((comp == 1) ? NumBitsY : NumBitsX) );
1248 }
1249
1250 INLINE static uint32_t GetBPC(uint32_t comp)
1251 {
1252 static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
1253 return MyBpc[comp];
1254 }
1255
1256 INLINE static bool isNormalized(uint32_t comp)
1257 {
1258 switch (comp)
1259 {
1260 case 0:
1261 return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
1262 case 1:
1263 return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
1264 case 2:
1265 return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
1266 case 3:
1267 return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
1268 }
1269 SWR_INVALID("Invalid component: %d", comp);
1270 return false;
1271 }
1272
1273 INLINE static float toFloat(uint32_t comp)
1274 {
1275 switch (comp)
1276 {
1277 case 0:
1278 return TypeTraits<X, NumBitsX>::toFloat();
1279 case 1:
1280 return TypeTraits<Y, NumBitsY>::toFloat();
1281 case 2:
1282 return TypeTraits<Z, NumBitsZ>::toFloat();
1283 case 3:
1284 return TypeTraits<W, NumBitsW>::toFloat();
1285 }
1286 SWR_INVALID("Invalid component: %d", comp);
1287 return TypeTraits<X, NumBitsX>::toFloat();
1288
1289 }
1290
1291 INLINE static float fromFloat(uint32_t comp)
1292 {
1293 switch (comp)
1294 {
1295 case 0:
1296 return TypeTraits<X, NumBitsX>::fromFloat();
1297 case 1:
1298 return TypeTraits<Y, NumBitsY>::fromFloat();
1299 case 2:
1300 return TypeTraits<Z, NumBitsZ>::fromFloat();
1301 case 3:
1302 return TypeTraits<W, NumBitsW>::fromFloat();
1303 }
1304 SWR_INVALID("Invalid component: %d", comp);
1305 return TypeTraits<X, NumBitsX>::fromFloat();
1306 }
1307
1308 INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
1309 {
1310 switch (comp)
1311 {
1312 case 0:
1313 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1314 case 1:
1315 return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
1316 case 2:
1317 return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
1318 case 3:
1319 return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1320 }
1321 SWR_INVALID("Invalid component: %d", comp);
1322 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1323 }
1324
1325 INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar const &src)
1326 {
1327 switch (comp)
1328 {
1329 case 0:
1330 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1331 return;
1332 case 1:
1333 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1334 return;
1335 case 2:
1336 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1337 return;
1338 case 3:
1339 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1340 return;
1341 }
1342 SWR_INVALID("Invalid component: %d", comp);
1343 }
1344
1345 INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1346 {
1347 simdscalar out;
1348 switch (comp)
1349 {
1350 case 0:
1351 out = TypeTraits<X, NumBitsX>::unpack(in); break;
1352 case 1:
1353 out = TypeTraits<Y, NumBitsY>::unpack(in); break;
1354 case 2:
1355 out = TypeTraits<Z, NumBitsZ>::unpack(in); break;
1356 case 3:
1357 out = TypeTraits<W, NumBitsW>::unpack(in); break;
1358 default:
1359 SWR_INVALID("Invalid component: %d", comp);
1360 out = in;
1361 break;
1362 }
1363 return out;
1364 }
1365
1366 INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1367 {
1368 simdscalar out;
1369 switch (comp)
1370 {
1371 case 0:
1372 out = TypeTraits<X, NumBitsX>::pack(in); break;
1373 case 1:
1374 out = TypeTraits<Y, NumBitsY>::pack(in); break;
1375 case 2:
1376 out = TypeTraits<Z, NumBitsZ>::pack(in); break;
1377 case 3:
1378 out = TypeTraits<W, NumBitsW>::pack(in); break;
1379 default:
1380 SWR_INVALID("Invalid component: %d", comp);
1381 out = in;
1382 break;
1383 }
1384 return out;
1385 }
1386
1387 INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1388 {
1389 switch (comp)
1390 {
1391 case 0:
1392 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1393 case 1:
1394 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1395 case 2:
1396 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1397 case 3:
1398 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1399 }
1400 SWR_INVALID("Invalid component: %d", comp);
1401 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1402 }
1403 #if ENABLE_AVX512_SIMD16
1404
1405 INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
1406 {
1407 switch (comp)
1408 {
1409 case 0:
1410 return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1411 case 1:
1412 return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
1413 case 2:
1414 return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
1415 case 3:
1416 return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
1417 }
1418 SWR_INVALID("Invalid component: %d", comp);
1419 return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1420 }
1421
1422 INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar const &src)
1423 {
1424 switch (comp)
1425 {
1426 case 0:
1427 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1428 return;
1429 case 1:
1430 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1431 return;
1432 case 2:
1433 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1434 return;
1435 case 3:
1436 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1437 return;
1438 }
1439 SWR_INVALID("Invalid component: %d", comp);
1440 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1441 }
1442
1443 INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
1444 {
1445 switch (comp)
1446 {
1447 case 0:
1448 return TypeTraits<X, NumBitsX>::unpack(in);
1449 case 1:
1450 return TypeTraits<Y, NumBitsY>::unpack(in);
1451 case 2:
1452 return TypeTraits<Z, NumBitsZ>::unpack(in);
1453 case 3:
1454 return TypeTraits<W, NumBitsW>::unpack(in);
1455 }
1456 SWR_INVALID("Invalid component: %d", comp);
1457 return TypeTraits<X, NumBitsX>::unpack(in);
1458 }
1459
1460 INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
1461 {
1462 switch (comp)
1463 {
1464 case 0:
1465 return TypeTraits<X, NumBitsX>::pack(in);
1466 case 1:
1467 return TypeTraits<Y, NumBitsY>::pack(in);
1468 case 2:
1469 return TypeTraits<Z, NumBitsZ>::pack(in);
1470 case 3:
1471 return TypeTraits<W, NumBitsW>::pack(in);
1472 }
1473 SWR_INVALID("Invalid component: %d", comp);
1474 return TypeTraits<X, NumBitsX>::pack(in);
1475 }
1476
1477 INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
1478 {
1479 switch (comp)
1480 {
1481 case 0:
1482 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1483 case 1:
1484 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1485 case 2:
1486 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1487 case 3:
1488 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1489 }
1490 SWR_INVALID("Invalid component: %d", comp);
1491 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1492 }
1493 #endif
1494 };