src/gallium/drivers/swr/rasterizer/core/format_types.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file format_types.h
  24 *
  25 * @brief Definitions for SWR_FORMAT functions.
  26 *
  27 ******************************************************************************/
  28 #pragma once
  29
  30 #include "utils.h"
  31
  32 //////////////////////////////////////////////////////////////////////////
  33 /// PackTraits - Helpers for packing / unpacking same pixel sizes
  34 //////////////////////////////////////////////////////////////////////////
  35 template <uint32_t NumBits, bool Signed = false>
  36 struct PackTraits
  37 {
  38     static const uint32_t MyNumBits = NumBits;
  39     static simdscalar loadSOA(const uint8_t *pSrc) = delete;
  40     static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
  41     static simdscalar unpack(simdscalar &in) = delete;
  42     static simdscalar pack(simdscalar &in) = delete;
  43 };
  44
  45 //////////////////////////////////////////////////////////////////////////
  46 /// PackTraits - Helpers for packing / unpacking unused channels
  47 //////////////////////////////////////////////////////////////////////////
  48 template <>
  49 struct PackTraits<0, false>
  50 {
  51     static const uint32_t MyNumBits = 0;
  52
  53     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
  54     static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
  55     static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
  56     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
  57 };
  58
  59
  60 //////////////////////////////////////////////////////////////////////////
  61 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
  62 //////////////////////////////////////////////////////////////////////////
  63 template <>
  64 struct PackTraits<8, false>
  65 {
  66     static const uint32_t MyNumBits = 8;
  67
  68     static simdscalar loadSOA(const uint8_t *pSrc)
  69     {
  70 #if KNOB_SIMD_WIDTH == 8
  71         __m256 result = _mm256_setzero_ps();
  72         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
  73         return _mm256_insertf128_ps(result, vLo, 0);
  74 #else
  75 #error Unsupported vector width
  76 #endif
  77     }
  78
  79     static void storeSOA(uint8_t *pDst, simdscalar src)
  80     {
  81         // store simd bytes
  82 #if KNOB_SIMD_WIDTH == 8
  83         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
  84 #else
  85 #error Unsupported vector width
  86 #endif
  87     }
  88
  89     static simdscalar unpack(simdscalar &in)
  90     {
  91 #if KNOB_SIMD_WIDTH == 8
  92 #if KNOB_ARCH==KNOB_ARCH_AVX
  93         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
  94         __m128i resLo = _mm_cvtepu8_epi32(src);
  95         __m128i resHi = _mm_shuffle_epi8(src,
  96             _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
  97
  98         __m256i result = _mm256_castsi128_si256(resLo);
  99         result = _mm256_insertf128_si256(result, resHi, 1);
 100         return _mm256_castsi256_ps(result);
 101 #elif KNOB_ARCH==KNOB_ARCH_AVX2
 102         return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 103 #endif
 104 #else
 105 #error Unsupported vector width
 106 #endif
 107     }
 108
 109     static simdscalar pack(simdscalar &in)
 110     {
 111 #if KNOB_SIMD_WIDTH == 8
 112         simdscalari src = _simd_castps_si(in);
 113         __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
 114         __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
 115         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
 116 #else
 117 #error Unsupported vector width
 118 #endif
 119     }
 120 };
 121
 122 //////////////////////////////////////////////////////////////////////////
 123 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
 124 //////////////////////////////////////////////////////////////////////////
 125 template <>
 126 struct PackTraits<8, true>
 127 {
 128     static const uint32_t MyNumBits = 8;
 129
 130     static simdscalar loadSOA(const uint8_t *pSrc)
 131     {
 132 #if KNOB_SIMD_WIDTH == 8
 133         __m256 result = _mm256_setzero_ps();
 134         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
 135         return _mm256_insertf128_ps(result, vLo, 0);
 136 #else
 137 #error Unsupported vector width
 138 #endif
 139     }
 140
 141     static void storeSOA(uint8_t *pDst, simdscalar src)
 142     {
 143         // store simd bytes
 144 #if KNOB_SIMD_WIDTH == 8
 145         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
 146 #else
 147 #error Unsupported vector width
 148 #endif
 149     }
 150
 151     static simdscalar unpack(simdscalar &in)
 152     {
 153 #if KNOB_SIMD_WIDTH == 8
 154 #if KNOB_ARCH==KNOB_ARCH_AVX
 155         SWR_ASSERT(0); // I think this may be incorrect.
 156         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
 157         __m128i resLo = _mm_cvtepi8_epi32(src);
 158         __m128i resHi = _mm_shuffle_epi8(src,
 159             _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
 160
 161         __m256i result = _mm256_castsi128_si256(resLo);
 162         result = _mm256_insertf128_si256(result, resHi, 1);
 163         return _mm256_castsi256_ps(result);
 164 #elif KNOB_ARCH==KNOB_ARCH_AVX2
 165         return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 166 #endif
 167 #else
 168 #error Unsupported vector width
 169 #endif
 170     }
 171
 172     static simdscalar pack(simdscalar &in)
 173     {
 174 #if KNOB_SIMD_WIDTH == 8
 175         simdscalari src = _simd_castps_si(in);
 176         __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
 177         __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
 178         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
 179 #else
 180 #error Unsupported vector width
 181 #endif
 182     }
 183 };
 184
 185 //////////////////////////////////////////////////////////////////////////
 186 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
 187 //////////////////////////////////////////////////////////////////////////
 188 template <>
 189 struct PackTraits<16, false>
 190 {
 191     static const uint32_t MyNumBits = 16;
 192
 193     static simdscalar loadSOA(const uint8_t *pSrc)
 194     {
 195 #if KNOB_SIMD_WIDTH == 8
 196         __m256 result = _mm256_setzero_ps();
 197         __m128 vLo = _mm_load_ps((const float*)pSrc);
 198         return _mm256_insertf128_ps(result, vLo, 0);
 199 #else
 200 #error Unsupported vector width
 201 #endif
 202     }
 203
 204     static void storeSOA(uint8_t *pDst, simdscalar src)
 205     {
 206 #if KNOB_SIMD_WIDTH == 8
 207         // store 16B (2B * 8)
 208         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
 209 #else
 210 #error Unsupported vector width
 211 #endif
 212     }
 213
 214     static simdscalar unpack(simdscalar &in)
 215     {
 216 #if KNOB_SIMD_WIDTH == 8
 217 #if KNOB_ARCH==KNOB_ARCH_AVX
 218         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
 219         __m128i resLo = _mm_cvtepu16_epi32(src);
 220         __m128i resHi = _mm_shuffle_epi8(src,
 221             _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
 222
 223         __m256i result = _mm256_castsi128_si256(resLo);
 224         result = _mm256_insertf128_si256(result, resHi, 1);
 225         return _mm256_castsi256_ps(result);
 226 #elif KNOB_ARCH==KNOB_ARCH_AVX2
 227         return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 228 #endif
 229 #else
 230 #error Unsupported vector width
 231 #endif
 232     }
 233
 234     static simdscalar pack(simdscalar &in)
 235     {
 236 #if KNOB_SIMD_WIDTH == 8
 237         simdscalari src = _simd_castps_si(in);
 238         __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
 239         return _mm256_castsi256_ps(res);
 240 #else
 241 #error Unsupported vector width
 242 #endif
 243     }
 244 };
 245
 246 //////////////////////////////////////////////////////////////////////////
 247 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
 248 //////////////////////////////////////////////////////////////////////////
 249 template <>
 250 struct PackTraits<16, true>
 251 {
 252     static const uint32_t MyNumBits = 16;
 253
 254     static simdscalar loadSOA(const uint8_t *pSrc)
 255     {
 256 #if KNOB_SIMD_WIDTH == 8
 257         __m256 result = _mm256_setzero_ps();
 258         __m128 vLo = _mm_load_ps((const float*)pSrc);
 259         return _mm256_insertf128_ps(result, vLo, 0);
 260 #else
 261 #error Unsupported vector width
 262 #endif
 263     }
 264
 265     static void storeSOA(uint8_t *pDst, simdscalar src)
 266     {
 267 #if KNOB_SIMD_WIDTH == 8
 268         // store 16B (2B * 8)
 269         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
 270 #else
 271 #error Unsupported vector width
 272 #endif
 273     }
 274
 275     static simdscalar unpack(simdscalar &in)
 276     {
 277 #if KNOB_SIMD_WIDTH == 8
 278 #if KNOB_ARCH==KNOB_ARCH_AVX
 279         SWR_ASSERT(0); // I think this is incorrectly implemented
 280         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
 281         __m128i resLo = _mm_cvtepi16_epi32(src);
 282         __m128i resHi = _mm_shuffle_epi8(src,
 283             _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
 284
 285         __m256i result = _mm256_castsi128_si256(resLo);
 286         result = _mm256_insertf128_si256(result, resHi, 1);
 287         return _mm256_castsi256_ps(result);
 288 #elif KNOB_ARCH==KNOB_ARCH_AVX2
 289         return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 290 #endif
 291 #else
 292 #error Unsupported vector width
 293 #endif
 294     }
 295
 296     static simdscalar pack(simdscalar &in)
 297     {
 298 #if KNOB_SIMD_WIDTH == 8
 299         simdscalari src = _simd_castps_si(in);
 300         __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
 301         return _mm256_castsi256_ps(res);
 302 #else
 303 #error Unsupported vector width
 304 #endif
 305     }
 306 };
 307
 308 //////////////////////////////////////////////////////////////////////////
 309 /// PackTraits - Helpers for packing / unpacking 32 bit channels
 310 //////////////////////////////////////////////////////////////////////////
 311 template <>
 312 struct PackTraits<32, false>
 313 {
 314     static const uint32_t MyNumBits = 32;
 315
 316     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
 317     static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
 318     static simdscalar unpack(simdscalar &in) { return in; }
 319     static simdscalar pack(simdscalar &in) { return in; }
 320 };
 321
 322 //////////////////////////////////////////////////////////////////////////
 323 /// TypeTraits - Format type traits.
 324 //////////////////////////////////////////////////////////////////////////
 325 template<SWR_TYPE type, uint32_t NumBits>
 326 struct TypeTraits : PackTraits<NumBits>
 327 {
 328     static const SWR_TYPE MyType = type;
 329     static float toFloat() { return 0.0; }
 330     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
 331     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 332 };
 333
 334 //////////////////////////////////////////////////////////////////////////
 335 /// TypeTraits - Format type traits specialization for UINT8
 336 //////////////////////////////////////////////////////////////////////////
 337 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
 338 {
 339     static const SWR_TYPE MyType = SWR_TYPE_UINT;
 340     static float toFloat() { return 0.0; }
 341     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
 342     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 343 };
 344
 345 //////////////////////////////////////////////////////////////////////////
 346 /// TypeTraits - Format type traits specialization for SINT8
 347 //////////////////////////////////////////////////////////////////////////
 348 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
 349 {
 350     static const SWR_TYPE MyType = SWR_TYPE_SINT;
 351     static float toFloat() { return 0.0; }
 352     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
 353     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 354 };
 355
 356 //////////////////////////////////////////////////////////////////////////
 357 /// TypeTraits - Format type traits specialization for UINT16
 358 //////////////////////////////////////////////////////////////////////////
 359 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
 360 {
 361     static const SWR_TYPE MyType = SWR_TYPE_UINT;
 362     static float toFloat() { return 0.0; }
 363     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
 364     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 365 };
 366
 367 //////////////////////////////////////////////////////////////////////////
 368 /// TypeTraits - Format type traits specialization for SINT16
 369 //////////////////////////////////////////////////////////////////////////
 370 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
 371 {
 372     static const SWR_TYPE MyType = SWR_TYPE_SINT;
 373     static float toFloat() { return 0.0; }
 374     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
 375     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 376 };
 377
 378 //////////////////////////////////////////////////////////////////////////
 379 /// TypeTraits - Format type traits specialization for UINT32
 380 //////////////////////////////////////////////////////////////////////////
 381 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
 382 {
 383     static const SWR_TYPE MyType = SWR_TYPE_UINT;
 384     static float toFloat() { return 0.0; }
 385     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
 386     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 387 };
 388
 389 //////////////////////////////////////////////////////////////////////////
 390 /// TypeTraits - Format type traits specialization for SINT32
 391 //////////////////////////////////////////////////////////////////////////
 392 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
 393 {
 394     static const SWR_TYPE MyType = SWR_TYPE_SINT;
 395     static float toFloat() { return 0.0; }
 396     static float fromFloat() { SWR_ASSERT(0); return 0.0; }
 397     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 398 };
 399
 400 //////////////////////////////////////////////////////////////////////////
 401 /// TypeTraits - Format type traits specialization for UNORM5
 402 //////////////////////////////////////////////////////////////////////////
 403 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
 404 {
 405     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
 406     static float toFloat() { return 1.0f / 31.0f; }
 407     static float fromFloat() { return 31.0f; }
 408     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 409 };
 410
 411 //////////////////////////////////////////////////////////////////////////
 412 /// TypeTraits - Format type traits specialization for UNORM6
 413 //////////////////////////////////////////////////////////////////////////
 414 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
 415 {
 416     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
 417     static float toFloat() { return 1.0f / 63.0f; }
 418     static float fromFloat() { return 63.0f; }
 419     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 420 };
 421
 422 //////////////////////////////////////////////////////////////////////////
 423 /// TypeTraits - Format type traits specialization for UNORM8
 424 //////////////////////////////////////////////////////////////////////////
 425 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
 426 {
 427     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
 428     static float toFloat() { return 1.0f / 255.0f; }
 429     static float fromFloat() { return 255.0f; }
 430     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 431 };
 432
 433 //////////////////////////////////////////////////////////////////////////
 434 /// TypeTraits - Format type traits specialization for SNORM8
 435 //////////////////////////////////////////////////////////////////////////
 436 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
 437 {
 438     static const SWR_TYPE MyType = SWR_TYPE_SNORM;
 439     static float toFloat() { return 1.0f / 127.0f; }
 440     static float fromFloat() { return 127.0f; }
 441     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 442 };
 443
 444 //////////////////////////////////////////////////////////////////////////
 445 /// TypeTraits - Format type traits specialization for UNORM16
 446 //////////////////////////////////////////////////////////////////////////
 447 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
 448 {
 449     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
 450     static float toFloat() { return 1.0f / 65535.0f; }
 451     static float fromFloat() { return 65535.0f; }
 452     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 453 };
 454
 455 //////////////////////////////////////////////////////////////////////////
 456 /// TypeTraits - Format type traits specialization for SNORM16
 457 //////////////////////////////////////////////////////////////////////////
 458 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
 459 {
 460     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
 461     static float toFloat() { return 1.0f / 32767.0f; }
 462     static float fromFloat() { return 32767.0f; }
 463     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 464 };
 465
 466 //////////////////////////////////////////////////////////////////////////
 467 /// TypeTraits - Format type traits specialization for UNORM24
 468 //////////////////////////////////////////////////////////////////////////
 469 template<>
 470 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
 471 {
 472     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
 473     static float toFloat() { return 1.0f / 16777215.0f; }
 474     static float fromFloat() { return 16777215.0f; }
 475     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 476 };
 477
 478 //////////////////////////////////////////////////////////////////////////
 479 // FLOAT Specializations from here on...
 480 //////////////////////////////////////////////////////////////////////////
 481 #define TO_M128i(a) _mm_castps_si128(a)
 482 #define TO_M128(a) _mm_castsi128_ps(a)
 483
 484 #include "math.h"
 485
 486 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
 487 inline static __m128 fastpow(__m128 arg) {
 488     __m128 ret = arg;
 489
 490     static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
 491         * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
 492
 493     // Apply a constant pre-correction factor.
 494     ret = _mm_mul_ps(ret, factor);
 495
 496     // Reinterpret arg as integer to obtain logarithm.
 497     //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
 498     ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
 499
 500     // Multiply logarithm by power.
 501     ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
 502
 503     // Convert back to "integer" to exponentiate.
 504     //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
 505     ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
 506
 507     return ret;
 508 }
 509
 510 inline static __m128 pow512_4(__m128 arg) {
 511     // 5/12 is too small, so compute the 4th root of 20/12 instead.
 512     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
 513     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
 514     __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
 515     __m128 xover = _mm_mul_ps(arg, xf);
 516
 517     __m128 xfm1 = _mm_rsqrt_ps(xf);
 518     __m128 x2 = _mm_mul_ps(arg, arg);
 519     __m128 xunder = _mm_mul_ps(x2, xfm1);
 520
 521     // sqrt2 * over + 2 * sqrt2 * under
 522     __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
 523         _mm_add_ps(xover, xunder));
 524
 525     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
 526     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
 527     return xavg;
 528 }
 529
 530 inline static __m128 powf_wrapper(__m128 Base, float Exp)
 531 {
 532     float *f = (float *)(&Base);
 533
 534     return _mm_set_ps(powf(f[0], Exp),
 535                       powf(f[1], Exp),
 536                       powf(f[2], Exp),
 537                       powf(f[3], Exp));
 538 }
 539
 540 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
 541 {
 542     // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
 543     __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
 544
 545     // squeeze the mask down to 16 bits (4 bits per DWORD)
 546     int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
 547
 548     __m128 Result;
 549
 550     //
 551     if (CompareResult == 0xFFFF)
 552     {
 553         // all DWORDs are <= the threshold
 554         Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
 555     }
 556     else if (CompareResult == 0x0)
 557     {
 558         // all DWORDs are > the threshold
 559         __m128 fSrc_0RGB = Src;
 560
 561         // --> 1.055f * c(1.0f/2.4f) - 0.055f
 562 #if KNOB_USE_FAST_SRGB == TRUE
 563         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
 564         __m128 f = pow512_4(fSrc_0RGB);
 565 #else
 566         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
 567 #endif
 568         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
 569         Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
 570     }
 571     else
 572     {
 573         // some DWORDs are <= the threshold and some are > threshold
 574         __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
 575
 576         __m128 fSrc_0RGB = Src;
 577
 578         // --> 1.055f * c(1.0f/2.4f) - 0.055f
 579 #if KNOB_USE_FAST_SRGB == TRUE
 580         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
 581         __m128 f = pow512_4(fSrc_0RGB);
 582 #else
 583         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
 584 #endif
 585         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
 586         f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
 587
 588         // Clear the alpha (is garbage after the sub)
 589         __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
 590
 591         __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
 592         __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
 593         __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
 594
 595         Result = TO_M128(CombinedParts);
 596     }
 597
 598     return Result;
 599 }
 600
 601 //////////////////////////////////////////////////////////////////////////
 602 /// TypeTraits - Format type traits specialization for FLOAT16
 603 //////////////////////////////////////////////////////////////////////////
 604 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
 605 {
 606     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
 607     static float toFloat() { return 1.0f; }
 608     static float fromFloat() { return 1.0f; }
 609     static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
 610
 611     static simdscalar pack(const simdscalar &in)
 612     {
 613 #if KNOB_SIMD_WIDTH == 8
 614 #if (KNOB_ARCH == KNOB_ARCH_AVX)
 615         // input is 8 packed float32, output is 8 packed float16
 616         simdscalari src = _simd_castps_si(in);
 617
 618         static const uint32_t FLOAT_EXP_BITS = 8;
 619         static const uint32_t FLOAT_MANTISSA_BITS = 23;
 620         static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
 621         static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
 622
 623         static const uint32_t HALF_EXP_BITS = 5;
 624         static const uint32_t HALF_MANTISSA_BITS = 10;
 625         static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
 626
 627         // minimum exponent required, exponents below this are flushed to 0.
 628         static const int32_t HALF_EXP_MIN = -14;
 629         static const int32_t FLOAT_EXP_BIAS = 127;
 630         static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
 631         static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
 632
 633         // maximum exponent required, exponents above this are set to infinity
 634         static const int32_t HALF_EXP_MAX = 15;
 635         static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
 636
 637         const simdscalari vSignMask     = _simd_set1_epi32(0x80000000);
 638         const simdscalari vExpMask      = _simd_set1_epi32(FLOAT_EXP_MASK);
 639         const simdscalari vManMask      = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
 640         const simdscalari vExpMin       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
 641         const simdscalari vExpMinFtz    = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
 642         const simdscalari vExpMax       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
 643
 644         simdscalari vSign       = _simd_and_si(src, vSignMask);
 645         simdscalari vExp        = _simd_and_si(src, vExpMask);
 646         simdscalari vMan        = _simd_and_si(src, vManMask);
 647
 648         simdscalari vFTZMask    = _simd_cmplt_epi32(vExp, vExpMinFtz);
 649         simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
 650         simdscalari vInfMask    = _simd_cmpeq_epi32(vExpMask, vExp);
 651         simdscalari vClampMask  = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
 652
 653         simdscalari vHalfExp    = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
 654
 655         // pack output 16-bits into the lower 16-bits of each 32-bit channel
 656         simdscalari vDst        = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
 657         vDst   = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
 658
 659         // Flush To Zero
 660         vDst   = _simd_andnot_si(vFTZMask, vDst);
 661         // Apply Infinites / NaN
 662         vDst   = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
 663
 664         // Apply clamps
 665         vDst = _simd_andnot_si(vClampMask, vDst);
 666         vDst = _simd_or_si(vDst,
 667                 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
 668
 669         // Compute Denormals (subnormals)
 670         if (!_mm256_testz_si256(vDenormMask, vDenormMask))
 671         {
 672             uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
 673             uint32_t *pExp = (uint32_t*)&vExp;
 674             uint32_t *pMan = (uint32_t*)&vMan;
 675             uint32_t *pDst = (uint32_t*)&vDst;
 676             for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
 677             {
 678                 if (pDenormMask[i])
 679                 {
 680                     // Need to compute subnormal value
 681                     uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
 682                     uint32_t mantissa = pMan[i] |
 683                                         (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s.  Make it explicit
 684
 685                     pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
 686                 }
 687             }
 688         }
 689
 690         // Add in sign bits
 691         vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
 692
 693         // Pack to lower 128-bits
 694         vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
 695
 696 #if 0
 697 #if !defined(NDEBUG)
 698         simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
 699
 700         for (uint32_t i = 0; i < 4; ++i)
 701         {
 702             SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
 703         }
 704 #endif
 705 #endif
 706
 707         return _simd_castsi_ps(vDst);
 708
 709 #else
 710         return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
 711 #endif
 712 #else
 713 #error Unsupported vector width
 714 #endif
 715     }
 716
 717     static simdscalar unpack(const simdscalar &in)
 718     {
 719         // input is 8 packed float16, output is 8 packed float32
 720         SWR_ASSERT(0); // @todo
 721         return _simd_setzero_ps();
 722     }
 723 };
 724
 725 //////////////////////////////////////////////////////////////////////////
 726 /// TypeTraits - Format type traits specialization for FLOAT32
 727 //////////////////////////////////////////////////////////////////////////
 728 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
 729 {
 730     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
 731     static float toFloat() { return 1.0f; }
 732     static float fromFloat() { return 1.0f; }
 733     static inline simdscalar convertSrgb(simdscalar &in)
 734     {
 735 #if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
 736         __m128 srcLo = _mm256_extractf128_ps(in, 0);
 737         __m128 srcHi = _mm256_extractf128_ps(in, 1);
 738
 739         srcLo = ConvertFloatToSRGB2(srcLo);
 740         srcHi = ConvertFloatToSRGB2(srcHi);
 741
 742         in = _mm256_insertf128_ps(in, srcLo, 0);
 743         in = _mm256_insertf128_ps(in, srcHi, 1);
 744
 745 #endif
 746         return in;
 747     }
 748 };
 749
 750 //////////////////////////////////////////////////////////////////////////
 751 /// Format1 - Bitfield for single component formats.
 752 //////////////////////////////////////////////////////////////////////////
 753 template<uint32_t x>
 754 struct Format1
 755 {
 756     union
 757     {
 758         uint32_t r : x;
 759
 760         ///@ The following are here to provide full template needed in Formats.
 761         uint32_t g : x;
 762         uint32_t b : x;
 763         uint32_t a : x;
 764     };
 765 };
 766
 767 //////////////////////////////////////////////////////////////////////////
 768 /// Format1 - Bitfield for single component formats - 8 bit specialization
 769 //////////////////////////////////////////////////////////////////////////
 770 template<>
 771 struct Format1<8>
 772 {
 773     union
 774     {
 775         uint8_t r;
 776
 777         ///@ The following are here to provide full template needed in Formats.
 778         uint8_t g;
 779         uint8_t b;
 780         uint8_t a;
 781     };
 782 };
 783
 784 //////////////////////////////////////////////////////////////////////////
 785 /// Format1 - Bitfield for single component formats - 16 bit specialization
 786 //////////////////////////////////////////////////////////////////////////
 787 template<>
 788 struct Format1<16>
 789 {
 790     union
 791     {
 792         uint16_t r;
 793
 794         ///@ The following are here to provide full template needed in Formats.
 795         uint16_t g;
 796         uint16_t b;
 797         uint16_t a;
 798     };
 799 };
 800
 801 //////////////////////////////////////////////////////////////////////////
 802 /// Format2 - Bitfield for 2 component formats.
 803 //////////////////////////////////////////////////////////////////////////
 804 template<uint32_t x, uint32_t y>
 805 union Format2
 806 {
 807     struct
 808     {
 809         uint32_t r : x;
 810         uint32_t g : y;
 811     };
 812     struct
 813     {
 814         ///@ The following are here to provide full template needed in Formats.
 815         uint32_t b : x;
 816         uint32_t a : y;
 817     };
 818 };
 819
 820 //////////////////////////////////////////////////////////////////////////
 821 /// Format2 - Bitfield for 2 component formats - 16 bit specialization
 822 //////////////////////////////////////////////////////////////////////////
 823 template<>
 824 union Format2<8,8>
 825 {
 826     struct
 827     {
 828         uint16_t r : 8;
 829         uint16_t g : 8;
 830     };
 831     struct
 832     {
 833         ///@ The following are here to provide full template needed in Formats.
 834         uint16_t b : 8;
 835         uint16_t a : 8;
 836     };
 837 };
 838
 839 //////////////////////////////////////////////////////////////////////////
 840 /// Format3 - Bitfield for 3 component formats.
 841 //////////////////////////////////////////////////////////////////////////
 842 template<uint32_t x, uint32_t y, uint32_t z>
 843 union Format3
 844 {
 845     struct
 846     {
 847         uint32_t r : x;
 848         uint32_t g : y;
 849         uint32_t b : z;
 850     };
 851     uint32_t a;  ///@note This is here to provide full template needed in Formats.
 852 };
 853
 854 //////////////////////////////////////////////////////////////////////////
 855 /// Format3 - Bitfield for 3 component formats - 16 bit specialization
 856 //////////////////////////////////////////////////////////////////////////
 857 template<>
 858 union Format3<5,6,5>
 859 {
 860     struct
 861     {
 862         uint16_t r : 5;
 863         uint16_t g : 6;
 864         uint16_t b : 5;
 865     };
 866     uint16_t a;  ///@note This is here to provide full template needed in Formats.
 867 };
 868
 869 //////////////////////////////////////////////////////////////////////////
 870 /// Format4 - Bitfield for 4 component formats.
 871 //////////////////////////////////////////////////////////////////////////
 872 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
 873 struct Format4
 874 {
 875     uint32_t r : x;
 876     uint32_t g : y;
 877     uint32_t b : z;
 878     uint32_t a : w;
 879 };
 880
 881 //////////////////////////////////////////////////////////////////////////
 882 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
 883 //////////////////////////////////////////////////////////////////////////
 884 template<>
 885 struct Format4<5,5,5,1>
 886 {
 887     uint16_t r : 5;
 888     uint16_t g : 5;
 889     uint16_t b : 5;
 890     uint16_t a : 1;
 891 };
 892
 893 //////////////////////////////////////////////////////////////////////////
 894 /// Format4 - Bitfield for 4 component formats - 16 bit specialization
 895 //////////////////////////////////////////////////////////////////////////
 896 template<>
 897 struct Format4<4,4,4,4>
 898 {
 899     uint16_t r : 4;
 900     uint16_t g : 4;
 901     uint16_t b : 4;
 902     uint16_t a : 4;
 903 };
 904
 905 //////////////////////////////////////////////////////////////////////////
 906 /// ComponentTraits - Default components
 907 //////////////////////////////////////////////////////////////////////////
 908 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
 909 struct Defaults
 910 {
 911     INLINE static uint32_t GetDefault(uint32_t comp)
 912     {
 913         static const uint32_t defaults[4]{ x, y, z, w };
 914         return defaults[comp];
 915     }
 916 };
 917
 918 //////////////////////////////////////////////////////////////////////////
 919 /// ComponentTraits - Component type traits.
 920 //////////////////////////////////////////////////////////////////////////
 921 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
 922 struct ComponentTraits
 923 {
 924     INLINE static SWR_TYPE GetType(uint32_t comp)
 925     {
 926         static const SWR_TYPE CompType[4]{ X, Y, Z, W };
 927         return CompType[comp];
 928     }
 929
 930     INLINE static uint32_t GetBPC(uint32_t comp)
 931     {
 932         static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
 933         return MyBpc[comp];
 934     }
 935
 936     INLINE static bool isNormalized(uint32_t comp)
 937     {
 938         switch (comp)
 939         {
 940         case 0:
 941             return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
 942         case 1:
 943             return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
 944         case 2:
 945             return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
 946         case 3:
 947             return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
 948         }
 949         SWR_ASSERT(0);
 950         return false;
 951     }
 952
 953     INLINE static float toFloat(uint32_t comp)
 954     {
 955         switch (comp)
 956         {
 957         case 0:
 958             return TypeTraits<X, NumBitsX>::toFloat();
 959         case 1:
 960             return TypeTraits<Y, NumBitsY>::toFloat();
 961         case 2:
 962             return TypeTraits<Z, NumBitsZ>::toFloat();
 963         case 3:
 964             return TypeTraits<W, NumBitsW>::toFloat();
 965         }
 966         SWR_ASSERT(0);
 967         return TypeTraits<X, NumBitsX>::toFloat();
 968
 969     }
 970
 971     INLINE static float fromFloat(uint32_t comp)
 972     {
 973         switch (comp)
 974         {
 975         case 0:
 976             return TypeTraits<X, NumBitsX>::fromFloat();
 977         case 1:
 978             return TypeTraits<Y, NumBitsY>::fromFloat();
 979         case 2:
 980             return TypeTraits<Z, NumBitsZ>::fromFloat();
 981         case 3:
 982             return TypeTraits<W, NumBitsW>::fromFloat();
 983         }
 984         SWR_ASSERT(0);
 985         return TypeTraits<X, NumBitsX>::fromFloat();
 986     }
 987
 988     INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
 989     {
 990         switch (comp)
 991         {
 992         case 0:
 993             return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
 994         case 1:
 995             return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
 996         case 2:
 997             return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
 998         case 3:
 999             return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1000         }
1001         SWR_ASSERT(0);
1002         return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1003     }
1004
1005     INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
1006     {
1007         switch (comp)
1008         {
1009         case 0:
1010             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1011             return;
1012         case 1:
1013             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1014             return;
1015         case 2:
1016             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1017             return;
1018         case 3:
1019             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1020             return;
1021         }
1022         SWR_ASSERT(0);
1023         TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1024     }
1025
1026     INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
1027     {
1028         switch (comp)
1029         {
1030         case 0:
1031             return TypeTraits<X, NumBitsX>::unpack(in);
1032         case 1:
1033             return TypeTraits<Y, NumBitsY>::unpack(in);
1034         case 2:
1035             return TypeTraits<Z, NumBitsZ>::unpack(in);
1036         case 3:
1037             return TypeTraits<W, NumBitsW>::unpack(in);
1038         }
1039         SWR_ASSERT(0);
1040         return TypeTraits<X, NumBitsX>::unpack(in);
1041     }
1042
1043     INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
1044     {
1045         switch (comp)
1046         {
1047         case 0:
1048             return TypeTraits<X, NumBitsX>::pack(in);
1049         case 1:
1050             return TypeTraits<Y, NumBitsY>::pack(in);
1051         case 2:
1052             return TypeTraits<Z, NumBitsZ>::pack(in);
1053         case 3:
1054             return TypeTraits<W, NumBitsW>::pack(in);
1055         }
1056         SWR_ASSERT(0);
1057         return TypeTraits<X, NumBitsX>::pack(in);
1058     }
1059
1060     INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
1061     {
1062         switch (comp)
1063         {
1064         case 0:
1065             return TypeTraits<X, NumBitsX>::convertSrgb(in);
1066         case 1:
1067             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1068         case 2:
1069             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1070         case 3:
1071             return TypeTraits<W, NumBitsW>::convertSrgb(in);
1072         }
1073         SWR_ASSERT(0);
1074         return TypeTraits<X, NumBitsX>::convertSrgb(in);
1075     }
1076 };