/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since a
 * change in bit width also changes how many elements fit in a register.
 *
 * To remember, there are a few invariants in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
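/*
 * Editor's note, not part of the original file: a concrete instance of the
 * two invariants above, assuming 128-bit registers. Converting 4 vectors of
 * 4 x float32 into unorm8 yields 1 vector of 16 x uint8: the register width
 * is preserved (32 * 4 == 8 * 16) and so is the total element count
 * (4 * 4 == 16 * 1). This is exactly the 4x4x32 --> 1x16x8 special case
 * handled in lp_build_conv() below.
 */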
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_half.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"
#include "lp_bld_format.h"
/**
 * Converts int16 half-float to float32
 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src  value to convert
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMValueRef h;

   if (util_cpu_caps.has_f16c &&
       (src_length == 4 || src_length == 8)) {
      if (LLVM_VERSION_MAJOR < 11) {
         const char *intrinsic = NULL;
         if (src_length == 4) {
            src = lp_build_pad_vector(gallivm, src, 8);
            intrinsic = "llvm.x86.vcvtph2ps.128";
         }
         else {
            intrinsic = "llvm.x86.vcvtph2ps.256";
         }
         return lp_build_intrinsic_unary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, f32_type), src);
      }
      else {
         /*
          * XXX: could probably use on other archs as well.
          * But if the cpu doesn't support it natively it looks like the
          * backends still can't lower it and will try to call out to external
          * libraries, which will crash.
          */
         /*
          * XXX: lp_build_vec_type() would use an int16 vector. Probably need
          * to revisit this at some point.
          */
         src = LLVMBuildBitCast(builder, src,
                                LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), src_length), "");
         return LLVMBuildFPExt(builder, src, lp_build_vec_type(gallivm, f32_type), "");
      }
   }

   h = LLVMBuildZExt(builder, src, int_vec_type, "");
   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
}
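/*
 * Editor's usage sketch, not part of the original file ("packed" is a
 * hypothetical <8 x i16> value holding IEEE half-floats):
 *
 *    LLVMValueRef f32 = lp_build_half_to_float(gallivm, packed);
 *
 * "f32" then has type <8 x float>. On an F16C-capable CPU this lowers to a
 * single vcvtph2ps; otherwise it takes the generic
 * lp_build_smallfloat_to_float() path with 10 mantissa and 5 exponent bits.
 */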
/**
 * Converts float32 to int16 half-float
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src  value to convert
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 * XXX: For GL, would prefer rounding towards nearest(-even).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   /*
    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
    * directly, without any (x86 or generic) intrinsics.
    * Albeit the rounding mode cannot be specified (and is undefined,
    * though in practice on x86 it seems to do nearest-even, but it may
    * be dependent on instruction set support), so it is essentially
    * useless here.
    */

   if (util_cpu_caps.has_f16c &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }
   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /*
          * XXX: not really supported by backends.
          * Even if it were now, the rounding mode cannot be specified and
          * is undefined.
          */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src  = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there were, since the FP's mantissa takes only a fraction of
 * register bits the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the input.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      /* instead of fadd/and could (with sse2) just use lp_build_iround */
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
       * still needs to be applied (only for numbers in [0.5, 1.0] would
       * conversion using truncation after scaling be sufficient).
       */

      double scale;
      struct lp_build_context uf32_bld;

      lp_build_context_init(&uf32_bld, gallivm, src_type);
      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = lp_build_iround(&uf32_bld, res);
   }
   else {
      /*
       * The destination exceeds what can be represented in the floating point.
       * So multiply by the largest power of two we can get away with, and then
       * subtract the most significant bit to rescale to normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned by FPToSI on overflow,
       * which is the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near 0.0,
       * and (mantissa + 1) correct bits for values near 1.0. Equally or more
       * important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1u, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      if (!src_type.sign && src_type.width == 32)
         res = LLVMBuildFPToUI(builder, res, int_vec_type, "");
      else
         res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      }
      else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB from the LSB-aligned copy, thereby re-scaling from
       * (1 << dst_width) to ((1 << dst_width) - 1).
       */
      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
 *
 * Returns the number of dsts created from src.
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   unsigned i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
    */
   if (src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.fixed    == 0 &&

       dst_type->floating == 0 &&
       dst_type->fixed    == 0 &&
       dst_type->width    == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
        (src_type.floating == 0 && dst_type->floating == 0 &&
         src_type.sign == dst_type->sign && dst_type->norm == 0))) {

      /* Special case 4x4x32 --> 1x16x8 */
      if (src_type.length == 4 &&
          (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
      {
         num_dsts = (num_srcs + 3) / 4;
         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8x32 --> 1x16x8 */
      if (src_type.length == 8 &&
          util_cpu_caps.has_avx)
      {
         num_dsts = (num_srcs + 1) / 2;
         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   }
   else {
      /*
       * If dst_width is 16 bits and src_width 32 and the dst vector size is
       * 64bit, try feeding 2 vectors at once so pack intrinsics can be used.
       * (For AVX, this isn't needed, since we usually get 256bit src and
       * 128bit dst vectors which works ok. If we do AVX2 pack this should
       * be extended, but we need to be able to tell the conversion code about
       * pack ordering first.)
       */
      unsigned ratio = 1;
      if (src_type.width == 2 * dst_type->width &&
          src_type.length == dst_type->length &&
          dst_type->floating == 0 && (num_srcs % 2 == 0) &&
          dst_type->width * dst_type->length == 64) {
         ratio = 2;
         num_dsts /= 2;
         dst_type->length *= 2;
      }
      for (i = 0; i < num_dsts; i++) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
      }
   }

   return num_dsts;
}
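/*
 * Editor's usage sketch, not part of the original file (hypothetical
 * values): with src_type = 4 x float32, *dst_type = 4 x unorm8 and
 * num_srcs = 4, the function widens dst_type->length to 16 and returns
 * num_dsts = 1, so the optimal 4x4x32 --> 1x16x8 path of lp_build_conv()
 * is taken:
 *
 *    int n = lp_build_conv_auto(gallivm, src_type, &dst_type, src, 4, dst);
 */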
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;
   /*
    * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
    * Only float -> s/unorm8 and (u)int32->(u)int8.
    * XXX: This should cover all interesting backend cases for 8 bit,
    * but should use the same strategy if dst is 16 bit.
    */
   if (src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.length   == 4 &&
       src_type.fixed    == 0 &&

       dst_type.floating == 0 &&
       dst_type.fixed    == 0 &&
       dst_type.width    == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
        (src_type.floating == 0 && dst_type.floating == 0 &&
         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&

       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
   {
      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         if (src_type.floating) {
            for (j = 0; j < dst_type.length / 4; ++j) {
               /*
                * XXX This is not actually fully correct. The float to int
                * conversion will produce the value 0x80000000 for everything
                * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq).
                * Hence, NaNs and negatives will get clamped just fine to zero
                * (relying on clamping pack behavior) when converting to unorm,
                * however too large values (both finite and infinite) will also
                * end up as zero, not 255.
                * For snorm, for now we'll keep bug compatibility with the
                * generic conversion path (meaning too large values are fine,
                * but NaNs get converted to -128 (purely by luck, as we don't
                * specify nan behavior for the max there) instead of 0).
                *
                * dEQP has GLES31 tests that expect +inf -> 255.0.
                */
               if (dst_type.sign) {
                  tmp[j] = lp_build_min(&bld, bld.one, src[j]);
               }
               else {
                  tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
                                            GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
               }
               tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
               tmp[j] = lp_build_iround(&bld, tmp[j]);
            }
         }
         else {
            for (j = 0; j < dst_type.length / 4; ++j) {
               if (!dst_type.sign) {
                  /*
                   * Pack clamp is always signed->unsigned (or signed->signed).
                   * Hence need min.
                   */
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  tmp[j] = lp_build_min(&bld, src[j], const_max);
               }
               else {
                  tmp[j] = src[j];
               }
            }
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         if (num_srcs < 4) {
            hi = lo;
         }
         else {
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }
      if (num_srcs < 4) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }
   /* Special case 2x8x32 --> 1x16x8, 1x8x32 -> 1x8x8
    */
   else if (src_type.norm     == 0 &&
            src_type.width    == 32 &&
            src_type.length   == 8 &&
            src_type.fixed    == 0 &&

            dst_type.floating == 0 &&
            dst_type.fixed    == 0 &&
            dst_type.width    == 8 &&

            ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
             (src_type.floating == 0 && dst_type.floating == 0 &&
              src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

            ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
             (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&

            util_cpu_caps.has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 2) {
         for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
            LLVMValueRef lo, hi, a;

            a = src[j];
            if (src_type.floating) {
               if (dst_type.sign) {
                  a = lp_build_min(&bld, bld.one, a);
               }
               else {
                  a = lp_build_min_ext(&bld, bld.one, a,
                                       GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
               }
               a = LLVMBuildFMul(builder, a, const_scale, "");
               a = lp_build_iround(&bld, a);
            }
            else {
               if (!dst_type.sign) {
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  a = lp_build_min(&bld, a, const_max);
               }
            }
            lo = lp_build_extract_range(gallivm, a, 0, 4);
            hi = lp_build_extract_range(gallivm, a, 4, 4);
            /* relying on clamping behavior of sse2 intrinsics here */
            tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
      }

      if (num_srcs == 1) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }
   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }
   /* Pre convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }
   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }
   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /*
          * these functions will use fptosi in some form which won't work
          * with 32bit uint dst. Causes lp_test_conv failures though.
          */
         if (0)
            assert(dst_type.sign || dst_type.width < 32);

         if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_iround(&bld, tmp[i]);
            }
            tmp_type.floating = FALSE;
         }
         else {
            LLVMTypeRef tmp_vec_type;

            tmp_type.floating = FALSE;
            tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
#if 0
               if(dst_type.sign)
                  tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
               else
                  tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
               /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      /* Compensate for different offsets */
      /* sscaled -> unorm and similar would cause negative shift count, skip */
      if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;

            shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
      }
   }
   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign   = dst_type.sign;
      new_type.width  = dst_type.width;
      new_type.length = dst_type.length;

      /*
       * Note that resize when using packs can sometimes get min/max
       * clamping for free. Should be able to exploit this...
       */
      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }
   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* the formula above will produce values below -1.0 for the most
          * negative values, but everything seems happy with that, hence
          * disable for now */
         if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, dst_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_max(&bld, tmp[i],
                                     lp_build_const_vec(gallivm, dst_type, -1.0f));
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];

         if (dst_shift - src_shift < dst_type.width) {
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
            }
         }
         else {
            /*
             * This happens for things like sscaled -> unorm conversions. Shift
             * counts equal to bit width cause undefined results, so hack around it.
             */
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_zero(gallivm, dst_type);
            }
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }
   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
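/*
 * Editor's usage sketch, not part of the original file: converting four
 * vectors of 4 x float32 colors in [0,1] into a single 16 x unorm8 vector,
 * which hits the 4x4x32 --> 1x16x8 fast path above on SSE2/AltiVec
 * (lp_type_unorm() is assumed to come from lp_bld_type.h):
 *
 *    struct lp_type f32_type = lp_type_float_vec(32, 128);  // 4 x float
 *    struct lp_type u8n_type = lp_type_unorm(8, 128);       // 16 x unorm8
 *    lp_build_conv(gallivm, f32_type, u8n_type, src, 4, dst, 1);
 */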
/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
 * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{
   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * We assume all values are 0 or -1
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
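/*
 * Editor's usage sketch, not part of the original file: converting four
 * <4 x i32> comparison masks into one <16 x i8> mask. Only a resize is
 * needed because the all-zeros/all-ones patterns survive truncation:
 *
 *    lp_build_conv_mask(gallivm,
 *                       lp_type_int_vec(32, 128), lp_type_int_vec(8, 128),
 *                       src, 4, dst, 1);
 */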