swr: [rasterizer core/sim] 8x2 backend + 16-wide tile clear/load/store
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / format_types.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file format_types.h
24 *
25 * @brief Pack/unpack and type-trait helpers for SWR_FORMAT conversions.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "utils.h"
31
32 //////////////////////////////////////////////////////////////////////////
33 /// PackTraits - Helpers for packing / unpacking channels of a given bit width
34 //////////////////////////////////////////////////////////////////////////
35 template <uint32_t NumBits, bool Signed = false>
36 struct PackTraits
37 {
38 static const uint32_t MyNumBits = NumBits;
39 static simdscalar loadSOA(const uint8_t *pSrc) = delete;
40 static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
41 static simdscalar unpack(simdscalar &in) = delete;
42 static simdscalar pack(simdscalar &in) = delete;
43 #if ENABLE_AVX512_SIMD16
44 static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
45 static void storeSOA(uint8_t *pDst, simd16scalar src) = delete;
46 static simd16scalar unpack(simd16scalar &in) = delete;
47 static simd16scalar pack(simd16scalar &in) = delete;
48 #endif
49 };
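// Illustrative usage sketch (pSrc/pDst are placeholders for suitably sized channel buffers):
// each PackTraits specialization below implements a round trip for one SOA channel --
// loadSOA() reads the packed channel bytes, unpack() widens them to one 32-bit value per
// SIMD lane, pack() narrows them back with saturation, and storeSOA() writes the packed
// bytes out.
//
//     simdscalar raw    = PackTraits<8>::loadSOA(pSrc);   // packed bytes -> register
//     simdscalar lanes  = PackTraits<8>::unpack(raw);     // one byte per 32-bit lane
//     simdscalar packed = PackTraits<8>::pack(lanes);     // saturate back to bytes
//     PackTraits<8>::storeSOA(pDst, packed);              // register -> packed bytes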
50
51 //////////////////////////////////////////////////////////////////////////
52 /// PackTraits - Helpers for packing / unpacking unused channels
53 //////////////////////////////////////////////////////////////////////////
54 template <>
55 struct PackTraits<0, false>
56 {
57 static const uint32_t MyNumBits = 0;
58
59 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
60 static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
61 static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
62 static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
63 #if ENABLE_AVX512_SIMD16
64 static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
65 static void storeSOA(uint8_t *pDst, simd16scalar src) { return; }
66 static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
67 static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
68 #endif
69 };
70
71 //////////////////////////////////////////////////////////////////////////
72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
73 //////////////////////////////////////////////////////////////////////////
74 template <>
75 struct PackTraits<8, false>
76 {
77 static const uint32_t MyNumBits = 8;
78
79 static simdscalar loadSOA(const uint8_t *pSrc)
80 {
81 #if KNOB_SIMD_WIDTH == 8
82 __m256 result = _mm256_setzero_ps();
83 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
84 return _mm256_insertf128_ps(result, vLo, 0);
85 #elif KNOB_SIMD_WIDTH == 16
86 #if ENABLE_AVX512_EMULATION
87 simdscalar result = _simd_setzero_ps();
88
89 __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
90
91 result.lo = _mm256_insertf128_ps(result.lo, src, 0);
92
93 return result;
94 #endif
95 #else
96 #error Unsupported vector width
97 #endif
98 }
99
100 static void storeSOA(uint8_t *pDst, simdscalar src)
101 {
102 // store simd bytes
103 #if KNOB_SIMD_WIDTH == 8
104 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
105 #elif KNOB_SIMD_WIDTH == 16
106 #if ENABLE_AVX512_EMULATION
107 _mm_store_ps(reinterpret_cast<float*>(pDst), _mm256_castps256_ps128(src.lo));
108 #endif
109 #else
110 #error Unsupported vector width
111 #endif
112 }
113
114 static simdscalar unpack(simdscalar &in)
115 {
116 #if KNOB_SIMD_WIDTH == 8
117 #if KNOB_ARCH==KNOB_ARCH_AVX
118 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
119 __m128i resLo = _mm_cvtepu8_epi32(src);
120 __m128i resHi = _mm_shuffle_epi8(src,
121 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
122
123 __m256i result = _mm256_castsi128_si256(resLo);
124 result = _mm256_insertf128_si256(result, resHi, 1);
125 return _mm256_castsi256_ps(result);
126 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
127 return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
128 #endif
129 #elif KNOB_SIMD_WIDTH == 16
130 #if ENABLE_AVX512_EMULATION
131 simdscalari result;
132
133 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in.lo));
134
135 result.lo = _mm256_cvtepu8_epi32(src);
136
137 result.hi = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8));
138
139 return _simd_castsi_ps(result);
140 #endif
141 #else
142 #error Unsupported vector width
143 #endif
144 }
145
146 static simdscalar pack(simdscalar &in)
147 {
148 #if KNOB_SIMD_WIDTH == 8
149 simdscalari src = _simd_castps_si(in);
150 __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
151 __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
152 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
153 #elif KNOB_SIMD_WIDTH == 16
154 #if ENABLE_AVX512_EMULATION
155 simdscalari result = _simd_setzero_si();
156
157 __m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.lo)), _mm256_extractf128_si256(_mm256_castps_si256(in.lo), 1));
158
159 __m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.hi)), _mm256_extractf128_si256(_mm256_castps_si256(in.hi), 1));
160
161 __m128i temp = _mm_packus_epi16(templo, temphi);
162
163 result.lo = _mm256_insertf128_si256(result.lo, temp, 0);
164
165 return _simd_castsi_ps(result);
166 #endif
167 #else
168 #error Unsupported vector width
169 #endif
170 }
171 #if ENABLE_AVX512_SIMD16
172
173 static simd16scalar loadSOA_16(const uint8_t *pSrc)
174 {
175 simd16scalar result = _simd16_setzero_ps();
176 simdscalar resultlo = _simd_setzero_ps();
177
178 const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
179
180 resultlo = _mm256_insertf128_ps(resultlo, src, 0);
181 result = _simd16_insert_ps(result, resultlo, 0);
182
183 return result;
184 }
185
186 static void storeSOA(uint8_t *pDst, simd16scalar src)
187 {
188 // store simd16 bytes
189 _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
190 }
191
192 static simd16scalar unpack(simd16scalar &in)
193 {
194 simd16scalari result = _simd16_setzero_si();
195
196 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
197
198 result = _simd16_insert_si(result, _simd_cvtepu8_epi32(src), 0);
199 result = _simd16_insert_si(result, _simd_cvtepu8_epi32(_mm_srli_si128(src, 8)), 1);
200
201 return _simd16_castsi_ps(result);
202 }
203
204 static simd16scalar pack(simd16scalar &in)
205 {
206 simd16scalari result = _simd16_setzero_si();
207 simdscalari resultlo = _simd_setzero_si();
208
209 __m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
210 __m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
211
212 __m128i temp = _mm_packus_epi16(templo, temphi);
213
214 resultlo = _mm256_inserti128_si256(resultlo, temp, 0);
215 result = _simd16_insert_si(result, resultlo, 0);
216
217 return _simd16_castsi_ps(result);
218 }
219 #endif
220 };
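// Note on the 8-bit unsigned traits above: loadSOA()/storeSOA() move one packed byte per
// pixel, unpack() zero-extends each byte into its own 32-bit lane, and pack() narrows with
// unsigned saturation (_mm_packus_epi32 / _mm_packus_epi16), so lane values outside
// [0, 255] clamp rather than wrap.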
221
222 //////////////////////////////////////////////////////////////////////////
223 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
224 //////////////////////////////////////////////////////////////////////////
225 template <>
226 struct PackTraits<8, true>
227 {
228 static const uint32_t MyNumBits = 8;
229
230 static simdscalar loadSOA(const uint8_t *pSrc)
231 {
232 #if KNOB_SIMD_WIDTH == 8
233 __m256 result = _mm256_setzero_ps();
234 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
235 return _mm256_insertf128_ps(result, vLo, 0);
236 #elif KNOB_SIMD_WIDTH == 16
237 #if ENABLE_AVX512_EMULATION
238 simdscalar result = _simd_setzero_ps();
239
240 __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
241
242 result.lo = _mm256_insertf128_ps(result.lo, src, 0);
243
244 return result;
245 #endif
246 #else
247 #error Unsupported vector width
248 #endif
249 }
250
251 static void storeSOA(uint8_t *pDst, simdscalar src)
252 {
253 // store simd bytes
254 #if KNOB_SIMD_WIDTH == 8
255 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
256 #elif KNOB_SIMD_WIDTH == 16
257 #if ENABLE_AVX512_EMULATION
258 _mm_store_ps(reinterpret_cast<float*>(pDst), _mm256_castps256_ps128(src.lo));
259 #endif
260 #else
261 #error Unsupported vector width
262 #endif
263 }
264
265 static simdscalar unpack(simdscalar &in)
266 {
267 #if KNOB_SIMD_WIDTH == 8
268 #if KNOB_ARCH==KNOB_ARCH_AVX
269 SWR_ASSERT(0); // Likely incorrect: the shuffle used for the upper half zero-extends the bytes instead of sign-extending them, so negative values are mishandled on this AVX path.
270 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
271 __m128i resLo = _mm_cvtepi8_epi32(src);
272 __m128i resHi = _mm_shuffle_epi8(src,
273 _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
274
275 __m256i result = _mm256_castsi128_si256(resLo);
276 result = _mm256_insertf128_si256(result, resHi, 1);
277 return _mm256_castsi256_ps(result);
278 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
279 return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
280 #endif
281 #elif KNOB_SIMD_WIDTH == 16
282 #if ENABLE_AVX512_EMULATION
283 simdscalari result;
284
285 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in.lo));
286
287 result.lo = _mm256_cvtepi8_epi32(src);
288 
289 result.hi = _mm256_cvtepi8_epi32(_mm_srli_si128(src, 8));
290
291 return _simd_castsi_ps(result);
292 #endif
293 #else
294 #error Unsupported vector width
295 #endif
296 }
297
298 static simdscalar pack(simdscalar &in)
299 {
300 #if KNOB_SIMD_WIDTH == 8
301 simdscalari src = _simd_castps_si(in);
302 __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
303 __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
304 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
305 #elif KNOB_SIMD_WIDTH == 16
306 #if ENABLE_AVX512_EMULATION
307 simdscalari result = _simd_setzero_si();
308
309 __m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.lo)), _mm256_extractf128_si256(_mm256_castps_si256(in.lo), 1));
310
311 __m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.hi)), _mm256_extractf128_si256(_mm256_castps_si256(in.hi), 1));
312
313 __m128i temp = _mm_packs_epi16(templo, temphi);
314
315 result.lo = _mm256_insertf128_si256(result.lo, temp, 0);
316
317 return _simd_castsi_ps(result);
318 #endif
319 #else
320 #error Unsupported vector width
321 #endif
322 }
323 #if ENABLE_AVX512_SIMD16
324
325 static simd16scalar loadSOA_16(const uint8_t *pSrc)
326 {
327 simd16scalar result = _simd16_setzero_ps();
328 simdscalar resultlo = _simd_setzero_ps();
329
330 const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
331
332 resultlo = _mm256_insertf128_ps(resultlo, src, 0);
333 result = _simd16_insert_ps(result, resultlo, 0);
334
335 return result;
336 }
337
338 static void storeSOA(uint8_t *pDst, simd16scalar src)
339 {
340 // store simd16 bytes
341 _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
342 }
343
344 static simd16scalar unpack(simd16scalar &in)
345 {
346 simd16scalari result = _simd16_setzero_si();
347
348 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
349
350 result = _simd16_insert_si(result, _simd_cvtepu8_epi32(src), 0);
351 result = _simd16_insert_si(result, _simd_cvtepu8_epi32(_mm_srli_si128(src, 8)), 1);
352
353 return _simd16_castsi_ps(result);
354 }
355
356 static simd16scalar pack(simd16scalar &in)
357 {
358 simd16scalari result = _simd16_setzero_si();
359 simdscalari resultlo = _simd_setzero_si();
360
361 __m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
362 __m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
363
364 __m128i temp = _mm_packs_epi16(templo, temphi);
365
366 resultlo = _mm256_inserti128_si256(resultlo, temp, 0);
367 result = _simd16_insert_si(result, resultlo, 0);
368
369 return _simd16_castsi_ps(result);
370 }
371 #endif
372 };
373
374 //////////////////////////////////////////////////////////////////////////
375 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
376 //////////////////////////////////////////////////////////////////////////
377 template <>
378 struct PackTraits<16, false>
379 {
380 static const uint32_t MyNumBits = 16;
381
382 static simdscalar loadSOA(const uint8_t *pSrc)
383 {
384 #if KNOB_SIMD_WIDTH == 8
385 __m256 result = _mm256_setzero_ps();
386 __m128 vLo = _mm_load_ps((const float*)pSrc);
387 return _mm256_insertf128_ps(result, vLo, 0);
388 #elif KNOB_SIMD_WIDTH == 16
389 #if ENABLE_AVX512_EMULATION
390 simdscalar result;
391
392 result.lo = _mm256_load_ps(reinterpret_cast<const float*>(pSrc));
393
394 result.hi = _mm256_undefined_ps();
395
396 return result;
397 #endif
398 #else
399 #error Unsupported vector width
400 #endif
401 }
402
403 static void storeSOA(uint8_t *pDst, simdscalar src)
404 {
405 #if KNOB_SIMD_WIDTH == 8
406 // store 16B (2B * 8)
407 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
408 #elif KNOB_SIMD_WIDTH == 16
409 #if ENABLE_AVX512_EMULATION
410 _mm256_store_ps(reinterpret_cast<float*>(pDst), src.lo);
411 #endif
412 #else
413 #error Unsupported vector width
414 #endif
415 }
416
417 static simdscalar unpack(simdscalar &in)
418 {
419 #if KNOB_SIMD_WIDTH == 8
420 #if KNOB_ARCH==KNOB_ARCH_AVX
421 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
422 __m128i resLo = _mm_cvtepu16_epi32(src);
423 __m128i resHi = _mm_shuffle_epi8(src,
424 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
425
426 __m256i result = _mm256_castsi128_si256(resLo);
427 result = _mm256_insertf128_si256(result, resHi, 1);
428 return _mm256_castsi256_ps(result);
429 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
430 return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
431 #endif
432 #elif KNOB_SIMD_WIDTH == 16
433 #if ENABLE_AVX512_EMULATION
434 simdscalari result;
435
436 result.lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 0));
437
438 result.hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 1));
439
440 return _simd_castsi_ps(result);
441 #endif
442 #else
443 #error Unsupported vector width
444 #endif
445 }
446
447 static simdscalar pack(simdscalar &in)
448 {
449 #if KNOB_SIMD_WIDTH == 8
450 simdscalari src = _simd_castps_si(in);
451 __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
452 return _mm256_castsi256_ps(res);
453 #elif KNOB_SIMD_WIDTH == 16
454 #if ENABLE_AVX512_EMULATION
455 simdscalari result;
456
457 __m256i inlo = _mm256_castps_si256(in.lo);
458 __m256i inhi = _mm256_castps_si256(in.hi);
459
460 __m256i templo = _mm256_permute2x128_si256(inlo, inhi, 0x20);
461 __m256i temphi = _mm256_permute2x128_si256(inlo, inhi, 0x31);
462
463 result.lo = _mm256_packus_epi32(templo, temphi);
464 result.hi = _mm256_undefined_si256();
465
466 return _simd_castsi_ps(result);
467 #endif
468 #else
469 #error Unsupported vector width
470 #endif
471 }
472 #if ENABLE_AVX512_SIMD16
473
474 static simd16scalar loadSOA_16(const uint8_t *pSrc)
475 {
476 simd16scalar result = _simd16_setzero_ps();
477
478 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
479
480 result = _simd16_insert_ps(result, resultlo, 0);
481
482 return result;
483 }
484
485 static void storeSOA(uint8_t *pDst, simd16scalar src)
486 {
487 _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
488 }
489
490 static simd16scalar unpack(simd16scalar &in)
491 {
492 simd16scalari result = _simd16_setzero_si();
493
494 result = _simd16_insert_si(result, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 0)), 0);
495 result = _simd16_insert_si(result, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1)), 1);
496
497 return _simd16_castsi_ps(result);
498 }
499
500 static simd16scalar pack(simd16scalar &in)
501 {
502 simd16scalari result = _simd16_setzero_si();
503
504 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));
505 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));
506
507 simdscalari templo = _simd_permute2f128_si(inlo, inhi, 0x20);
508 simdscalari temphi = _simd_permute2f128_si(inlo, inhi, 0x31);
509
510 result = _simd16_insert_si(result, _simd_packus_epi32(templo, temphi), 0);
511
512 return _simd16_castsi_ps(result);
513 }
514 #endif
515 };
516
517 //////////////////////////////////////////////////////////////////////////
518 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
519 //////////////////////////////////////////////////////////////////////////
520 template <>
521 struct PackTraits<16, true>
522 {
523 static const uint32_t MyNumBits = 16;
524
525 static simdscalar loadSOA(const uint8_t *pSrc)
526 {
527 #if KNOB_SIMD_WIDTH == 8
528 __m256 result = _mm256_setzero_ps();
529 __m128 vLo = _mm_load_ps((const float*)pSrc);
530 return _mm256_insertf128_ps(result, vLo, 0);
531 #elif KNOB_SIMD_WIDTH == 16
532 #if ENABLE_AVX512_EMULATION
533 simdscalar result;
534
535 result.lo = _mm256_load_ps(reinterpret_cast<const float*>(pSrc));
536
537 result.hi = _mm256_undefined_ps();
538
539 return result;
540 #endif
541 #else
542 #error Unsupported vector width
543 #endif
544 }
545
546 static void storeSOA(uint8_t *pDst, simdscalar src)
547 {
548 #if KNOB_SIMD_WIDTH == 8
549 // store 16B (2B * 8)
550 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
551 #elif KNOB_SIMD_WIDTH == 16
552 #if ENABLE_AVX512_EMULATION
553 _mm256_store_ps(reinterpret_cast<float*>(pDst), src.lo);
554 #endif
555 #else
556 #error Unsupported vector width
557 #endif
558 }
559
560 static simdscalar unpack(simdscalar &in)
561 {
562 #if KNOB_SIMD_WIDTH == 8
563 #if KNOB_ARCH==KNOB_ARCH_AVX
564 SWR_ASSERT(0); // Likely incorrect: the shuffle used for the upper half zero-extends the words instead of sign-extending them, so negative values are mishandled on this AVX path.
565 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
566 __m128i resLo = _mm_cvtepi16_epi32(src);
567 __m128i resHi = _mm_shuffle_epi8(src,
568 _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
569
570 __m256i result = _mm256_castsi128_si256(resLo);
571 result = _mm256_insertf128_si256(result, resHi, 1);
572 return _mm256_castsi256_ps(result);
573 #elif KNOB_ARCH>=KNOB_ARCH_AVX2
574 return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
575 #endif
576 #elif KNOB_SIMD_WIDTH == 16
577 #if ENABLE_AVX512_EMULATION
578 simdscalari result;
579
580 result.lo = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 0));
581 
582 result.hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 1));
583
584 return _simd_castsi_ps(result);
585 #endif
586 #else
587 #error Unsupported vector width
588 #endif
589 }
590
591 static simdscalar pack(simdscalar &in)
592 {
593 #if KNOB_SIMD_WIDTH == 8
594 simdscalari src = _simd_castps_si(in);
595 __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
596 return _mm256_castsi256_ps(res);
597 #elif KNOB_SIMD_WIDTH == 16
598 #if ENABLE_AVX512_EMULATION
599 simdscalari result;
600
601 __m256i inlo = _mm256_castps_si256(in.lo);
602 __m256i inhi = _mm256_castps_si256(in.hi);
603
604 __m256i templo = _mm256_permute2x128_si256(inlo, inhi, 0x20);
605 __m256i temphi = _mm256_permute2x128_si256(inlo, inhi, 0x31);
606
607 result.lo = _mm256_packs_epi32(templo, temphi);
608 result.hi = _mm256_undefined_si256();
609
610 return _simd_castsi_ps(result);
611 #endif
612 #else
613 #error Unsupported vector width
614 #endif
615 }
616 #if ENABLE_AVX512_SIMD16
617
618 static simd16scalar loadSOA_16(const uint8_t *pSrc)
619 {
620 simd16scalar result = _simd16_setzero_ps();
621
622 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
623
624 result = _simd16_insert_ps(result, resultlo, 0);
625
626 return result;
627 }
628
629 static void storeSOA(uint8_t *pDst, simd16scalar src)
630 {
631 _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
632 }
633
634 static simd16scalar unpack(simd16scalar &in)
635 {
636 simd16scalari result = _simd16_setzero_si();
637
638 result = _simd16_insert_si(result, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 0)), 0);
639 result = _simd16_insert_si(result, _simd_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1)), 1);
640
641 return _simd16_castsi_ps(result);
642 }
643
644 static simd16scalar pack(simd16scalar &in)
645 {
646 simd16scalari result = _simd16_setzero_si();
647
648 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));
649 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));
650
651 simdscalari templo = _simd_permute2f128_si(inlo, inhi, 0x20);
652 simdscalari temphi = _simd_permute2f128_si(inlo, inhi, 0x31);
653
654 result = _simd16_insert_si(result, _simd_packus_epi32(templo, temphi), 0);
655
656 return _simd16_castsi_ps(result);
657 }
658 #endif
659 };
660
661 //////////////////////////////////////////////////////////////////////////
662 /// PackTraits - Helpers for packing / unpacking 32 bit channels
663 //////////////////////////////////////////////////////////////////////////
664 template <>
665 struct PackTraits<32, false>
666 {
667 static const uint32_t MyNumBits = 32;
668
669 static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
670 static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
671 static simdscalar unpack(simdscalar &in) { return in; }
672 static simdscalar pack(simdscalar &in) { return in; }
673 #if ENABLE_AVX512_SIMD16
674
675 static simd16scalar loadSOA_16(const uint8_t *pSrc)
676 {
677 return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
678 }
679
680 static void storeSOA(uint8_t *pDst, simd16scalar src)
681 {
682 _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
683 }
684
685 static simd16scalar unpack(simd16scalar &in)
686 {
687 return in;
688 }
689
690 static simd16scalar pack(simd16scalar &in)
691 {
692 return in;
693 }
694 #endif
695 };
696
697 //////////////////////////////////////////////////////////////////////////
698 /// TypeTraits - Format type traits.
699 //////////////////////////////////////////////////////////////////////////
700 template<SWR_TYPE type, uint32_t NumBits>
701 struct TypeTraits : PackTraits<NumBits>
702 {
703 static const SWR_TYPE MyType = type;
704 static float toFloat() { return 0.0; }
705 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
706 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
707 };
708
709 //////////////////////////////////////////////////////////////////////////
710 /// TypeTraits - Format type traits specialization for UINT8
711 //////////////////////////////////////////////////////////////////////////
712 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
713 {
714 static const SWR_TYPE MyType = SWR_TYPE_UINT;
715 static float toFloat() { return 0.0; }
716 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
717 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
718 };
719
720 //////////////////////////////////////////////////////////////////////////
721 /// TypeTraits - Format type traits specialization for SINT8
722 //////////////////////////////////////////////////////////////////////////
723 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
724 {
725 static const SWR_TYPE MyType = SWR_TYPE_SINT;
726 static float toFloat() { return 0.0; }
727 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
728 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
729 };
730
731 //////////////////////////////////////////////////////////////////////////
732 /// TypeTraits - Format type traits specialization for UINT16
733 //////////////////////////////////////////////////////////////////////////
734 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
735 {
736 static const SWR_TYPE MyType = SWR_TYPE_UINT;
737 static float toFloat() { return 0.0; }
738 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
739 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
740 };
741
742 //////////////////////////////////////////////////////////////////////////
743 /// TypeTraits - Format type traits specialization for SINT16
744 //////////////////////////////////////////////////////////////////////////
745 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
746 {
747 static const SWR_TYPE MyType = SWR_TYPE_SINT;
748 static float toFloat() { return 0.0; }
749 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
750 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
751 };
752
753 //////////////////////////////////////////////////////////////////////////
754 /// TypeTraits - Format type traits specialization for UINT32
755 //////////////////////////////////////////////////////////////////////////
756 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
757 {
758 static const SWR_TYPE MyType = SWR_TYPE_UINT;
759 static float toFloat() { return 0.0; }
760 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
761 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
762 };
763
764 //////////////////////////////////////////////////////////////////////////
765 /// TypeTraits - Format type traits specialization for SINT32
766 //////////////////////////////////////////////////////////////////////////
767 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
768 {
769 static const SWR_TYPE MyType = SWR_TYPE_SINT;
770 static float toFloat() { return 0.0; }
771 static float fromFloat() { SWR_ASSERT(0); return 0.0; }
772 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
773 };
774
775 //////////////////////////////////////////////////////////////////////////
776 /// TypeTraits - Format type traits specialization for UNORM5
777 //////////////////////////////////////////////////////////////////////////
778 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
779 {
780 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
781 static float toFloat() { return 1.0f / 31.0f; }
782 static float fromFloat() { return 31.0f; }
783 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
784 };
785
786 //////////////////////////////////////////////////////////////////////////
787 /// TypeTraits - Format type traits specialization for UNORM6
788 //////////////////////////////////////////////////////////////////////////
789 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
790 {
791 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
792 static float toFloat() { return 1.0f / 63.0f; }
793 static float fromFloat() { return 63.0f; }
794 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
795 };
796
797 //////////////////////////////////////////////////////////////////////////
798 /// TypeTraits - Format type traits specialization for UNORM8
799 //////////////////////////////////////////////////////////////////////////
800 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
801 {
802 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
803 static float toFloat() { return 1.0f / 255.0f; }
804 static float fromFloat() { return 255.0f; }
805 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
806 };
807
808 //////////////////////////////////////////////////////////////////////////
809 /// TypeTraits - Format type traits specialization for SNORM8
810 //////////////////////////////////////////////////////////////////////////
811 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
812 {
813 static const SWR_TYPE MyType = SWR_TYPE_SNORM;
814 static float toFloat() { return 1.0f / 127.0f; }
815 static float fromFloat() { return 127.0f; }
816 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
817 };
818
819 //////////////////////////////////////////////////////////////////////////
820 /// TypeTraits - Format type traits specialization for UNORM16
821 //////////////////////////////////////////////////////////////////////////
822 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
823 {
824 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
825 static float toFloat() { return 1.0f / 65535.0f; }
826 static float fromFloat() { return 65535.0f; }
827 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
828 };
829
830 //////////////////////////////////////////////////////////////////////////
831 /// TypeTraits - Format type traits specialization for SNORM16
832 //////////////////////////////////////////////////////////////////////////
833 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
834 {
835 static const SWR_TYPE MyType = SWR_TYPE_SNORM;
836 static float toFloat() { return 1.0f / 32767.0f; }
837 static float fromFloat() { return 32767.0f; }
838 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
839 };
840
841 //////////////////////////////////////////////////////////////////////////
842 /// TypeTraits - Format type traits specialization for UNORM24
843 //////////////////////////////////////////////////////////////////////////
844 template<>
845 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
846 {
847 static const SWR_TYPE MyType = SWR_TYPE_UNORM;
848 static float toFloat() { return 1.0f / 16777215.0f; }
849 static float fromFloat() { return 16777215.0f; }
850 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
851 };
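// Worked example of the normalized scale factors above: for UNORM8, toFloat() is
// 1.0f / 255.0f, so a stored byte of 128 unpacks to 128 * (1.0f / 255.0f) ~= 0.502f,
// and a shader value of 0.75f packs back through fromFloat() as 0.75f * 255.0f = 191.25f
// before integer conversion. SNORM types scale by the positive range only (e.g. 127.0f
// for SNORM8), matching the specializations above.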
852
853 //////////////////////////////////////////////////////////////////////////
854 // FLOAT Specializations from here on...
855 //////////////////////////////////////////////////////////////////////////
856 #define TO_M128i(a) _mm_castps_si128(a)
857 #define TO_M128(a) _mm_castsi128_ps(a)
858
859 #include "math.h"
860
861 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
862 inline static __m128 fastpow(__m128 arg) {
863 __m128 ret = arg;
864
865 static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
866 * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
867
868 // Apply a constant pre-correction factor.
869 ret = _mm_mul_ps(ret, factor);
870
871 // Reinterpret arg as integer to obtain logarithm.
872 //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
873 ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
874
875 // Multiply logarithm by power.
876 ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
877
878 // Convert back to "integer" to exponentiate.
879 //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
880 ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
881
882 return ret;
883 }
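// Note on the trick above: reinterpreting an IEEE-754 float's bits as an integer yields
// roughly (log2(x) + 127) * 2^23, i.e. a scaled, biased logarithm. fastpow multiplies in
// that "log domain" to raise to the power expnum/expden, and converting the value back to
// an integer bit pattern at the end acts as the matching exp2. The pre-correction factor
// folds the exponent bias and the coeffnum/coeffden coefficient into a single constant.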
884
885 inline static __m128 pow512_4(__m128 arg) {
886 // 5/12 is too small, so compute the 4th root of 20/12 instead.
887 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
888 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
889 __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
890 __m128 xover = _mm_mul_ps(arg, xf);
891
892 __m128 xfm1 = _mm_rsqrt_ps(xf);
893 __m128 x2 = _mm_mul_ps(arg, arg);
894 __m128 xunder = _mm_mul_ps(x2, xfm1);
895
896 // sqrt2 * over + 2 * sqrt2 * under
897 __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
898 _mm_add_ps(xover, xunder));
899
900 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
901 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
902 return xavg;
903 }
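// Note: each trailing multiply-by-rsqrt step takes a square root (y * rsqrt(y) == sqrt(y)
// up to rsqrt precision), so the two steps return the 4th root of xavg. Combined with the
// x^(2/3)-based terms above, pow512_4 therefore approximates x^(5/3 * 1/4) == x^(5/12),
// the exponent needed for the sRGB encode below.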
904
905 inline static __m128 powf_wrapper(__m128 Base, float Exp)
906 {
907 float *f = (float *)(&Base);
908
909 return _mm_set_ps(powf(f[3], Exp),
910 powf(f[2], Exp),
911 powf(f[1], Exp),
912 powf(f[0], Exp));
913 }
914
915 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
916 {
917 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the sRGB linear/gamma threshold (0.0031308f)
918 __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
919
920 // squeeze the mask down to 16 bits (4 bits per DWORD)
921 int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
922
923 __m128 Result;
924
925 //
926 if (CompareResult == 0xFFFF)
927 {
928 // all DWORDs are <= the threshold
929 Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
930 }
931 else if (CompareResult == 0x0)
932 {
933 // all DWORDs are > the threshold
934 __m128 fSrc_0RGB = Src;
935
936 // --> 1.055f * pow(c, 1.0f / 2.4f) - 0.055f
937 #if KNOB_USE_FAST_SRGB == TRUE
938 // 1.0f / 2.4f == 5.0f / 12.0f, which is the exponent pow512_4 approximates.
939 __m128 f = pow512_4(fSrc_0RGB);
940 #else
941 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
942 #endif
943 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
944 Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
945 }
946 else
947 {
948 // some DWORDs are <= the threshold and some are > threshold
949 __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
950
951 __m128 fSrc_0RGB = Src;
952
953 // --> 1.055f * pow(c, 1.0f / 2.4f) - 0.055f
954 #if KNOB_USE_FAST_SRGB == TRUE
955 // 1.0f / 2.4f == 5.0f / 12.0f, which is the exponent pow512_4 approximates.
956 __m128 f = pow512_4(fSrc_0RGB);
957 #else
958 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
959 #endif
960 f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
961 f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
962
963 // Mask intended to clear the (garbage) alpha lane; note the mask below is all ones, so as written this AND leaves every lane unchanged
964 __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
965
966 __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
967 __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
968 __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
969
970 Result = TO_M128(CombinedParts);
971 }
972
973 return Result;
974 }
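// Scalar reference for the vector routine above (illustrative only; the helper name is
// hypothetical and the block is compiled out):
#if 0
static inline float ConvertFloatToSRGBScalarRef(float value)
{
    // linear segment below the threshold, gamma segment above it
    if (value < 0.0031308f)
        return value * 12.92f;
    return 1.055f * powf(value, 1.0f / 2.4f) - 0.055f;
}
#endif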
975
976 #if ENABLE_AVX512_SIMD16
977 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
978 inline static simd16scalar fastpow(simd16scalar value)
979 {
980 static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
981 * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
982
983 // Apply a constant pre-correction factor.
984 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
985
986 // Reinterpret arg as integer to obtain logarithm.
987 //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
988 result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
989
990 // Multiply logarithm by power.
991 result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
992
993 // Convert back to "integer" to exponentiate.
994 //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
995 result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
996
997 return result;
998 }
999
1000 inline static simd16scalar pow512_4(simd16scalar arg)
1001 {
1002 // 5/12 is too small, so compute the 4th root of 20/12 instead.
1003 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
1004 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
1005 simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
1006 simd16scalar xover = _simd16_mul_ps(arg, xf);
1007
1008 simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
1009 simd16scalar x2 = _simd16_mul_ps(arg, arg);
1010 simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
1011
1012 // sqrt2 * over + 2 * sqrt2 * under
1013 simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));
1014
1015 xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
1016 xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
1017
1018 return xavg;
1019 }
1020
1021 inline static simd16scalar powf_wrapper(const simd16scalar base, float exp)
1022 {
1023 const float *f = reinterpret_cast<const float *>(&base);
1024
1025 return _simd16_set_ps(
1026 powf(f[15], exp),
1027 powf(f[14], exp),
1028 powf(f[13], exp),
1029 powf(f[12], exp),
1030 powf(f[11], exp),
1031 powf(f[10], exp),
1032 powf(f[ 9], exp),
1033 powf(f[ 8], exp),
1034 powf(f[ 7], exp),
1035 powf(f[ 6], exp),
1036 powf(f[ 5], exp),
1037 powf(f[ 4], exp),
1038 powf(f[ 3], exp),
1039 powf(f[ 2], exp),
1040 powf(f[ 1], exp),
1041 powf(f[ 0], exp)
1042 );
1043 }
1044
1045 // float to SRGB conversion formula
1046 //
1047 // if (value < 0.0031308f)
1048 // value *= 12.92f;
1049 // else
1050 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
1051 //
1052 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
1053 {
1054 // create a mask where the source is < the minimal SRGB float value
1055 const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
1056
1057 // if all elements are < the threshold, result = value * 12.92
1058 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
1059
1060 if (_simd16_mask2int(mask) != 0xFFFF)
1061 {
1062 // some elements are >= the threshold, result = 1.055 * pow(value, 1.0 / 2.4) - 0.055
1063 #if KNOB_USE_FAST_SRGB == TRUE
1064 // 1.0f / 2.4f == 5.0f / 12.0f, which is the exponent pow512_4 approximates.
1065 simd16scalar result2 = pow512_4(value);
1066 #else
1067 simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
1068 #endif
1069
1070 result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
1071 result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
1072
1073 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
1074 // only native AVX512 can directly use the computed mask for the blend operation
1075 result = _mm512_mask_blend_ps(mask, result2, result);
1076 #else
1077 result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
1078 #endif
1079 }
1080
1081 return result;
1082 }
1083
1084 #endif
1085 //////////////////////////////////////////////////////////////////////////
1086 /// TypeTraits - Format type traits specialization for FLOAT16
1087 //////////////////////////////////////////////////////////////////////////
1088 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
1089 {
1090 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
1091 static float toFloat() { return 1.0f; }
1092 static float fromFloat() { return 1.0f; }
1093 static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
1094
1095 static simdscalar pack(const simdscalar &in)
1096 {
1097 #if KNOB_SIMD_WIDTH == 8
1098 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1099 // input is 8 packed float32, output is 8 packed float16
1100 simdscalari src = _simd_castps_si(in);
1101
1102 static const uint32_t FLOAT_EXP_BITS = 8;
1103 static const uint32_t FLOAT_MANTISSA_BITS = 23;
1104 static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
1105 static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
1106
1107 static const uint32_t HALF_EXP_BITS = 5;
1108 static const uint32_t HALF_MANTISSA_BITS = 10;
1109 static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
1110
1111 // minimum exponent required, exponents below this are flushed to 0.
1112 static const int32_t HALF_EXP_MIN = -14;
1113 static const int32_t FLOAT_EXP_BIAS = 127;
1114 static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
1115 static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the implicit leading 1, which becomes explicit in the denormal path below
1116
1117 // maximum exponent required; finite values with larger exponents are clamped to the largest finite half (0x7BFF), while Inf/NaN inputs keep the all-ones exponent
1118 static const int32_t HALF_EXP_MAX = 15;
1119 static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
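// Layout reminder: float32 is 1 sign bit, 8 exponent bits (bias 127) and 23 mantissa bits;
// float16 is 1 sign bit, 5 exponent bits (bias 15) and 10 mantissa bits. The clamp constant
// 0x7BFF used below is the largest finite half (exponent 11110b, mantissa all ones, 65504.0f).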
1120
1121 const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
1122 const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK);
1123 const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
1124 const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
1125 const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
1126 const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
1127
1128 simdscalari vSign = _simd_and_si(src, vSignMask);
1129 simdscalari vExp = _simd_and_si(src, vExpMask);
1130 simdscalari vMan = _simd_and_si(src, vManMask);
1131
1132 simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz);
1133 simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
1134 simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp);
1135 simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
1136
1137 simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
1138
1139 // pack output 16-bits into the lower 16-bits of each 32-bit channel
1140 simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
1141 vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1142
1143 // Flush To Zero
1144 vDst = _simd_andnot_si(vFTZMask, vDst);
1145 // Apply Infinities / NaN
1146 vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
1147
1148 // Apply clamps
1149 vDst = _simd_andnot_si(vClampMask, vDst);
1150 vDst = _simd_or_si(vDst,
1151 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
1152
1153 // Compute Denormals (subnormals)
1154 if (!_mm256_testz_si256(vDenormMask, vDenormMask))
1155 {
1156 uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
1157 uint32_t *pExp = (uint32_t*)&vExp;
1158 uint32_t *pMan = (uint32_t*)&vMan;
1159 uint32_t *pDst = (uint32_t*)&vDst;
1160 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1161 {
1162 if (pDenormMask[i])
1163 {
1164 // Need to compute subnormal value
1165 uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
1166 uint32_t mantissa = pMan[i] |
1167 (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. Make it explicit
1168
1169 pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1170 }
1171 }
1172 }
1173
1174 // Add in sign bits
1175 vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
1176
1177 // Pack to lower 128-bits
1178 vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
1179
1180 #if 0
1181 #if !defined(NDEBUG)
1182 simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
1183
1184 for (uint32_t i = 0; i < 4; ++i)
1185 {
1186 SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
1187 }
1188 #endif
1189 #endif
1190
1191 return _simd_castsi_ps(vDst);
1192
1193 #else
1194 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
1195 #endif
1196 #elif KNOB_SIMD_WIDTH == 16
1197 #if ENABLE_AVX512_EMULATION
1198 simdscalari result;
1199
1200 __m128i templo = _mm256_cvtps_ph(in.lo, _MM_FROUND_TRUNC);
1201 __m128i temphi = _mm256_cvtps_ph(in.hi, _MM_FROUND_TRUNC);
1202
1203 result.lo = _mm256_castsi128_si256(templo);
1204 result.lo = _mm256_insertf128_si256(result.lo, temphi, 1);
1205
1206 result.hi = _mm256_undefined_si256();
1207
1208 return _simd_castsi_ps(result);
1209 #endif
1210 #else
1211 #error Unsupported vector width
1212 #endif
1213 }
1214
1215 static simdscalar unpack(const simdscalar &in)
1216 {
1217 // input is 8 packed float16, output is 8 packed float32
1218 SWR_ASSERT(0); // @todo
1219 return _simd_setzero_ps();
1220 }
1221 #if ENABLE_AVX512_SIMD16
1222
1223 static simd16scalar pack(const simd16scalar &in)
1224 {
1225 simd16scalari result = _simd16_setzero_si();
1226 simdscalari resultlo = _simd_setzero_si();
1227
1228 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1229 simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
1230 simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
1231
1232 __m128i templo = _mm256_extractf128_si256(_simd_castps_si(simdlo), 0);
1233 __m128i temphi = _mm256_extractf128_si256(_simd_castps_si(simdhi), 0);
1234
1235 #else
1236 __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
1237 __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
1238
1239 #endif
1240 resultlo = _mm256_insertf128_si256(resultlo, templo, 0);
1241 resultlo = _mm256_insertf128_si256(resultlo, temphi, 1);
1242
1243 result = _simd16_insert_si(result, resultlo, 0);
1244
1245 return _simd16_castsi_ps(result);
1246 }
1247
1248 static simd16scalar unpack(const simd16scalar &in)
1249 {
1250 // input is 16 packed float16, output is 16 packed float32
1251 SWR_ASSERT(0); // @todo
1252 return _simd16_setzero_ps();
1253 }
1254 #endif
1255 };
1256
1257 //////////////////////////////////////////////////////////////////////////
1258 /// TypeTraits - Format type traits specialization for FLOAT32
1259 //////////////////////////////////////////////////////////////////////////
1260 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
1261 {
1262 static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
1263 static float toFloat() { return 1.0f; }
1264 static float fromFloat() { return 1.0f; }
1265 static inline simdscalar convertSrgb(simdscalar &in)
1266 {
1267 #if KNOB_SIMD_WIDTH == 8
1268 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
1269 __m128 srcLo = _mm256_extractf128_ps(in, 0);
1270 __m128 srcHi = _mm256_extractf128_ps(in, 1);
1271
1272 srcLo = ConvertFloatToSRGB2(srcLo);
1273 srcHi = ConvertFloatToSRGB2(srcHi);
1274
1275 in = _mm256_insertf128_ps(in, srcLo, 0);
1276 in = _mm256_insertf128_ps(in, srcHi, 1);
1277 #endif
1278 #elif KNOB_SIMD_WIDTH == 16
1279 #if ENABLE_AVX512_EMULATION
1280 __m128 inlo0 = _mm256_extractf128_ps(in.lo, 0);
1281 __m128 inlo1 = _mm256_extractf128_ps(in.lo, 1);
1282 __m128 inhi0 = _mm256_extractf128_ps(in.hi, 0);
1283 __m128 inhi1 = _mm256_extractf128_ps(in.hi, 1);
1284
1285 inlo0 = ConvertFloatToSRGB2(inlo0);
1286 inlo1 = ConvertFloatToSRGB2(inlo1);
1287 inhi0 = ConvertFloatToSRGB2(inhi0);
1288 inhi1 = ConvertFloatToSRGB2(inhi1);
1289
1290 in.lo = _mm256_insertf128_ps(in.lo, inlo0, 0);
1291 in.lo = _mm256_insertf128_ps(in.lo, inlo1, 1);
1292 in.hi = _mm256_insertf128_ps(in.hi, inhi0, 0);
1293 in.hi = _mm256_insertf128_ps(in.hi, inhi1, 1);
1294 #endif
1295 #else
1296 #error Unsupported vector width
1297 #endif
1298 return in;
1299 }
1300 #if ENABLE_AVX512_SIMD16
1301
1302 static inline simd16scalar convertSrgb(simd16scalar &in)
1303 {
1304 return ConvertFloatToSRGB2(in);
1305 }
1306 #endif
1307 };
1308
1309 //////////////////////////////////////////////////////////////////////////
1310 /// Format1 - Bitfield for single component formats.
1311 //////////////////////////////////////////////////////////////////////////
1312 template<uint32_t x>
1313 struct Format1
1314 {
1315 union
1316 {
1317 uint32_t r : x;
1318
1319 ///@ The following are here to provide the full template interface needed by Formats.
1320 uint32_t g : x;
1321 uint32_t b : x;
1322 uint32_t a : x;
1323 };
1324 };
1325
1326 //////////////////////////////////////////////////////////////////////////
1327 /// Format1 - Bitfield for single component formats - 8 bit specialization
1328 //////////////////////////////////////////////////////////////////////////
1329 template<>
1330 struct Format1<8>
1331 {
1332 union
1333 {
1334 uint8_t r;
1335
1336 ///@ The following are here to provide the full template interface needed by Formats.
1337 uint8_t g;
1338 uint8_t b;
1339 uint8_t a;
1340 };
1341 };
1342
1343 //////////////////////////////////////////////////////////////////////////
1344 /// Format1 - Bitfield for single component formats - 16 bit specialization
1345 //////////////////////////////////////////////////////////////////////////
1346 template<>
1347 struct Format1<16>
1348 {
1349 union
1350 {
1351 uint16_t r;
1352
1353 ///@ The following are here to provide the full template interface needed by Formats.
1354 uint16_t g;
1355 uint16_t b;
1356 uint16_t a;
1357 };
1358 };
1359
1360 //////////////////////////////////////////////////////////////////////////
1361 /// Format2 - Bitfield for 2 component formats.
1362 //////////////////////////////////////////////////////////////////////////
1363 template<uint32_t x, uint32_t y>
1364 union Format2
1365 {
1366 struct
1367 {
1368 uint32_t r : x;
1369 uint32_t g : y;
1370 };
1371 struct
1372 {
1373 ///@ The following are here to provide the full template interface needed by Formats.
1374 uint32_t b : x;
1375 uint32_t a : y;
1376 };
1377 };
1378
1379 //////////////////////////////////////////////////////////////////////////
1380 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
1381 //////////////////////////////////////////////////////////////////////////
1382 template<>
1383 union Format2<8,8>
1384 {
1385 struct
1386 {
1387 uint16_t r : 8;
1388 uint16_t g : 8;
1389 };
1390 struct
1391 {
1392 ///@ The following are here to provide the full template interface needed by Formats.
1393 uint16_t b : 8;
1394 uint16_t a : 8;
1395 };
1396 };
1397
1398 //////////////////////////////////////////////////////////////////////////
1399 /// Format3 - Bitfield for 3 component formats.
1400 //////////////////////////////////////////////////////////////////////////
1401 template<uint32_t x, uint32_t y, uint32_t z>
1402 union Format3
1403 {
1404 struct
1405 {
1406 uint32_t r : x;
1407 uint32_t g : y;
1408 uint32_t b : z;
1409 };
1410 uint32_t a; ///@note This is here to provide the full template interface needed by Formats.
1411 };
1412
1413 //////////////////////////////////////////////////////////////////////////
1414 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
1415 //////////////////////////////////////////////////////////////////////////
1416 template<>
1417 union Format3<5,6,5>
1418 {
1419 struct
1420 {
1421 uint16_t r : 5;
1422 uint16_t g : 6;
1423 uint16_t b : 5;
1424 };
1425 uint16_t a; ///@note This is here to provide the full template interface needed by Formats.
1426 };
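// Worked example, assuming the usual LSB-first bitfield allocation on the targeted
// compilers: Format3<5,6,5> maps r to bits [4:0], g to bits [10:5] and b to bits [15:11]
// of the packed 16-bit texel; the otherwise unused members in the smaller FormatN
// templates exist only so every FormatN exposes the same four component names.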
1427
1428 //////////////////////////////////////////////////////////////////////////
1429 /// Format4 - Bitfield for 4 component formats.
1430 //////////////////////////////////////////////////////////////////////////
1431 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1432 struct Format4
1433 {
1434 uint32_t r : x;
1435 uint32_t g : y;
1436 uint32_t b : z;
1437 uint32_t a : w;
1438 };
1439
1440 //////////////////////////////////////////////////////////////////////////
1441 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1442 //////////////////////////////////////////////////////////////////////////
1443 template<>
1444 struct Format4<5,5,5,1>
1445 {
1446 uint16_t r : 5;
1447 uint16_t g : 5;
1448 uint16_t b : 5;
1449 uint16_t a : 1;
1450 };
1451
1452 //////////////////////////////////////////////////////////////////////////
1453 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
1454 //////////////////////////////////////////////////////////////////////////
1455 template<>
1456 struct Format4<4,4,4,4>
1457 {
1458 uint16_t r : 4;
1459 uint16_t g : 4;
1460 uint16_t b : 4;
1461 uint16_t a : 4;
1462 };
1463
1464 //////////////////////////////////////////////////////////////////////////
1465 /// ComponentTraits - Default components
1466 //////////////////////////////////////////////////////////////////////////
1467 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1468 struct Defaults
1469 {
1470 INLINE static uint32_t GetDefault(uint32_t comp)
1471 {
1472 static const uint32_t defaults[4]{ x, y, z, w };
1473 return defaults[comp];
1474 }
1475 };
1476
1477 //////////////////////////////////////////////////////////////////////////
1478 /// ComponentTraits - Component type traits.
1479 //////////////////////////////////////////////////////////////////////////
1480 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
1481 struct ComponentTraits
1482 {
1483 INLINE static SWR_TYPE GetType(uint32_t comp)
1484 {
1485 static const SWR_TYPE CompType[4]{ X, Y, Z, W };
1486 return CompType[comp];
1487 }
1488
1489 INLINE static uint32_t GetBPC(uint32_t comp)
1490 {
1491 static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
1492 return MyBpc[comp];
1493 }
1494
1495 INLINE static bool isNormalized(uint32_t comp)
1496 {
1497 switch (comp)
1498 {
1499 case 0:
1500 return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
1501 case 1:
1502 return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
1503 case 2:
1504 return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
1505 case 3:
1506 return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
1507 }
1508 SWR_ASSERT(0);
1509 return false;
1510 }
1511
1512 INLINE static float toFloat(uint32_t comp)
1513 {
1514 switch (comp)
1515 {
1516 case 0:
1517 return TypeTraits<X, NumBitsX>::toFloat();
1518 case 1:
1519 return TypeTraits<Y, NumBitsY>::toFloat();
1520 case 2:
1521 return TypeTraits<Z, NumBitsZ>::toFloat();
1522 case 3:
1523 return TypeTraits<W, NumBitsW>::toFloat();
1524 }
1525 SWR_ASSERT(0);
1526 return TypeTraits<X, NumBitsX>::toFloat();
1527
1528 }
1529
1530 INLINE static float fromFloat(uint32_t comp)
1531 {
1532 switch (comp)
1533 {
1534 case 0:
1535 return TypeTraits<X, NumBitsX>::fromFloat();
1536 case 1:
1537 return TypeTraits<Y, NumBitsY>::fromFloat();
1538 case 2:
1539 return TypeTraits<Z, NumBitsZ>::fromFloat();
1540 case 3:
1541 return TypeTraits<W, NumBitsW>::fromFloat();
1542 }
1543 SWR_ASSERT(0);
1544 return TypeTraits<X, NumBitsX>::fromFloat();
1545 }
1546
1547 INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
1548 {
1549 switch (comp)
1550 {
1551 case 0:
1552 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1553 case 1:
1554 return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
1555 case 2:
1556 return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
1557 case 3:
1558 return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1559 }
1560 SWR_ASSERT(0);
1561 return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1562 }
1563
1564 INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
1565 {
1566 switch (comp)
1567 {
1568 case 0:
1569 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1570 return;
1571 case 1:
1572 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1573 return;
1574 case 2:
1575 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1576 return;
1577 case 3:
1578 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1579 return;
1580 }
1581 SWR_ASSERT(0);
1582 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1583 }
1584
1585 INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1586 {
1587 switch (comp)
1588 {
1589 case 0:
1590 return TypeTraits<X, NumBitsX>::unpack(in);
1591 case 1:
1592 return TypeTraits<Y, NumBitsY>::unpack(in);
1593 case 2:
1594 return TypeTraits<Z, NumBitsZ>::unpack(in);
1595 case 3:
1596 return TypeTraits<W, NumBitsW>::unpack(in);
1597 }
1598 SWR_ASSERT(0);
1599 return TypeTraits<X, NumBitsX>::unpack(in);
1600 }
1601
1602 INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1603 {
1604 switch (comp)
1605 {
1606 case 0:
1607 return TypeTraits<X, NumBitsX>::pack(in);
1608 case 1:
1609 return TypeTraits<Y, NumBitsY>::pack(in);
1610 case 2:
1611 return TypeTraits<Z, NumBitsZ>::pack(in);
1612 case 3:
1613 return TypeTraits<W, NumBitsW>::pack(in);
1614 }
1615 SWR_ASSERT(0);
1616 return TypeTraits<X, NumBitsX>::pack(in);
1617 }
1618
1619 INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1620 {
1621 switch (comp)
1622 {
1623 case 0:
1624 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1625 case 1:
1626 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1627 case 2:
1628 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1629 case 3:
1630 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1631 }
1632 SWR_ASSERT(0);
1633 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1634 }
1635 #if ENABLE_AVX512_SIMD16
1636
1637 INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
1638 {
1639 switch (comp)
1640 {
1641 case 0:
1642 return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1643 case 1:
1644 return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
1645 case 2:
1646 return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
1647 case 3:
1648 return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
1649 }
1650 SWR_ASSERT(0);
1651 return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1652 }
1653
1654 INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
1655 {
1656 switch (comp)
1657 {
1658 case 0:
1659 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1660 return;
1661 case 1:
1662 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1663 return;
1664 case 2:
1665 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1666 return;
1667 case 3:
1668 TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1669 return;
1670 }
1671 SWR_ASSERT(0);
1672 TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1673 }
1674
1675 INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
1676 {
1677 switch (comp)
1678 {
1679 case 0:
1680 return TypeTraits<X, NumBitsX>::unpack(in);
1681 case 1:
1682 return TypeTraits<Y, NumBitsY>::unpack(in);
1683 case 2:
1684 return TypeTraits<Z, NumBitsZ>::unpack(in);
1685 case 3:
1686 return TypeTraits<W, NumBitsW>::unpack(in);
1687 }
1688 SWR_ASSERT(0);
1689 return TypeTraits<X, NumBitsX>::unpack(in);
1690 }
1691
1692 INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
1693 {
1694 switch (comp)
1695 {
1696 case 0:
1697 return TypeTraits<X, NumBitsX>::pack(in);
1698 case 1:
1699 return TypeTraits<Y, NumBitsY>::pack(in);
1700 case 2:
1701 return TypeTraits<Z, NumBitsZ>::pack(in);
1702 case 3:
1703 return TypeTraits<W, NumBitsW>::pack(in);
1704 }
1705 SWR_ASSERT(0);
1706 return TypeTraits<X, NumBitsX>::pack(in);
1707 }
1708
1709 INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
1710 {
1711 switch (comp)
1712 {
1713 case 0:
1714 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1715 case 1:
1716 return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1717 case 2:
1718 return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1719 case 3:
1720 return TypeTraits<W, NumBitsW>::convertSrgb(in);
1721 }
1722 SWR_ASSERT(0);
1723 return TypeTraits<X, NumBitsX>::convertSrgb(in);
1724 }
1725 #endif
1726 };
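// Usage sketch (illustrative; the RGBA8 alias is hypothetical): an R8G8B8A8_UNORM-style
// format would be described through these traits roughly as follows, after which the
// per-component queries stay uniform across formats.
//
//     using RGBA8 = ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8,
//                                   SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>;
//     uint32_t bits  = RGBA8::GetBPC(0);       // 8
//     float    scale = RGBA8::toFloat(2);      // 1.0f / 255.0f
//     bool     norm  = RGBA8::isNormalized(3); // true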