/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since a
 * single register can hold several values, possibly of different widths.
 *
 * To remember, there are a few invariants in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs. efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely you should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
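
/*
 * Worked example of the invariants above (a sketch, not code from this file):
 * converting four registers of 4 x float32 to packed unorm8 yields a single
 * register of 16 x i8:
 *
 *    src_type.width * src_type.length == 32 * 4  == 128
 *    dst_type.width * dst_type.length ==  8 * 16 == 128
 *    src_type.length * num_srcs == 4 * 4 == 16 == dst_type.length * num_dsts
 *
 * This is exactly the 4x4x32 --> 1x16x8 special case handled in
 * lp_build_conv() below.
 */
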
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_half.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"
#include "lp_bld_format.h"
/**
 * Converts int16 half-float to float32.
 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src   value to convert
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMValueRef h;

   if (util_cpu_caps.has_f16c &&
       (src_length == 4 || src_length == 8)) {
      const char *intrinsic = NULL;
      if (src_length == 4) {
         src = lp_build_pad_vector(gallivm, src, 8);
         intrinsic = "llvm.x86.vcvtph2ps.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtph2ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic,
                                      lp_build_vec_type(gallivm, f32_type), src);
   }

   /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
   h = LLVMBuildZExt(builder, src, int_vec_type, "");
   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
}
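
/*
 * Usage sketch (hypothetical values, assuming the usual gallivm JIT setup):
 *
 *    LLVMValueRef h8;   // 8 x i16 vector holding half-float bit patterns
 *    LLVMValueRef f8 = lp_build_half_to_float(gallivm, h8);
 *    // f8 is an 8 x float vector; with F16C this is a single vcvtph2ps
 */
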
/**
 * Converts float32 to int16 half-float.
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src   value to convert
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 * XXX: For GL, would prefer rounding towards nearest(-even).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   /*
    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
    * directly, without any (x86 or generic) intrinsics.
    * Albeit the rounding mode cannot be specified (and is undefined,
    * though in practice on x86 it seems to do nearest-even, but it may
    * be dependent on instruction set support), so it is essentially
    * useless here.
    */

   if (util_cpu_caps.has_f16c &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }
   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debug code to compare the result against the reference
    * util_float_to_half() implementation, element by element.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /*
          * XXX: not really supported by backends.
          * Even if it were, the rounding mode cannot be specified and
          * is undefined.
          */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
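
/*
 * Usage sketch (hypothetical values): packing a 4 x float vector into
 * 4 x i16 half-float bit patterns, e.g. for an R16G16B16A16_FLOAT texel:
 *
 *    LLVMValueRef f4;   // 4 x float
 *    LLVMValueRef h4 = lp_build_float_to_half(gallivm, f4);
 */
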
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. Firstly, there is no
 * single precision FP to unsigned integer conversion Intel SSE instruction.
 * Secondly, even if there were, since the FP's mantissa takes only a
 * fraction of register bits, the typical scale-and-cast approach would
 * require double precision for accurate results, and therefore half the
 * throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */
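      /*
       * Worked example (a sketch with concrete numbers): for float32
       * (mantissa = 23) and dst_width = 8:
       *
       *    ubound = 256, mask = 255, scale = 255/256, bias = 2^(23 - 8)
       *
       * For src = 1.0, the sum 1.0*255/256 + 32768 carries the integer 255
       * in the low 8 mantissa bits, so the final mask yields 255. For
       * src = 0.5 the exact scaled value is 127.5, and the FAdd rounds it
       * to nearest-even, giving 128.
       */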
      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      /* instead of fadd/and could (with sse2) just use lp_build_iround */
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
       * still needs to be applied (only for numbers in [0.5, 1.0] would
       * conversion using truncation after scaling be sufficient).
       */
      double scale;
      struct lp_build_context uf32_bld;

      lp_build_context_init(&uf32_bld, gallivm, src_type);
      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = lp_build_iround(&uf32_bld, res);
   }
   else {
      /*
       * The destination exceeds what can be represented in the floating point.
       * So multiply by the largest power of two we can get away with, and then
       * subtract the most significant bit to rescale to normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state that INT_MIN should be returned by FPToSI, which
       * happens to be the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */
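      /*
       * Worked example (a sketch): for a float32 src (width 32, mantissa 23)
       * and dst_width = 32: n = MIN2(31, 32) = 31, scale = 2^31, lshift = 1,
       * rshift = 31. src = 1.0 converts to 2^31; shifting left overflows
       * to 0, and subtracting the right-shifted MSB (1) yields 0xffffffff,
       * the exact unorm32 result.
       */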
      unsigned n = MIN2(src_type.width - 1u, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      if (!src_type.sign && src_type.width == 32)
         res = LLVMBuildFPToUI(builder, res, int_vec_type, "");
      else
         res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      }
      else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (re-aligned to the LSB) from the shifted value,
       * thereby re-scaling from (1 << dst_width) to ((1 << dst_width) - 1).
       */
      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
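
/*
 * Usage sketch (hypothetical values): converting a clamped 4 x float vector
 * in [0, 1] to 8-bit unorm values held in 32-bit lanes:
 *
 *    struct lp_type f32_4 = lp_type_float_vec(32, 128);
 *    LLVMValueRef u = lp_build_clamped_float_to_unsigned_norm(gallivm,
 *                                                             f32_4, 8, src);
 *    // each i32 lane of u is now in [0, 255]
 */
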
/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits in what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
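
/*
 * Worked sketch of the bias trick above: for a float32 dst (mantissa = 23)
 * and src_width = 32, n = 23, so the input is shifted right by 9 bits and
 * bias = 2^(23 - 23) = 1.0. OR'ing the 23 remaining bits into the mantissa
 * of 1.0 produces a float in [1.0, 2.0); subtracting the bias maps it to
 * [0.0, 1.0 - 2^-23], and scaling by 2^23/(2^23 - 1) stretches the range
 * to exactly [0.0, 1.0].
 */
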
/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
 *
 * Returns the number of dsts created from src
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   int i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
    */
   if (src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.fixed    == 0 &&

       dst_type->floating == 0 &&
       dst_type->fixed    == 0 &&
       dst_type->width    == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
        (src_type.floating == 0 && dst_type->floating == 0 &&
         src_type.sign == dst_type->sign && dst_type->norm == 0))) {

      /* Special case 4x4x32 --> 1x16x8 */
      if (src_type.length == 4 &&
          (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
      {
         num_dsts = (num_srcs + 3) / 4;
         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8x32 --> 1x16x8 */
      if (src_type.length == 8 &&
          util_cpu_caps.has_avx)
      {
         num_dsts = (num_srcs + 1) / 2;
         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   }
   else {
      /*
       * If dst_width is 16 bits and src_width 32 and the dst vector size
       * is 64bit, try feeding 2 vectors at once so pack intrinsics can be
       * used. (For AVX, this isn't needed, since we usually get 256bit src
       * and 128bit dst vectors which works ok. If we do AVX2 pack this
       * should be extended, but we need to be able to tell the conversion
       * code about pack ordering first.)
       */
      unsigned ratio = 1;
      if (src_type.width == 2 * dst_type->width &&
          src_type.length == dst_type->length &&
          dst_type->floating == 0 && (num_srcs % 2 == 0) &&
          dst_type->width * dst_type->length == 64) {
         ratio = 2;
         num_dsts /= 2;
         dst_type->length *= 2;
      }
      for (i = 0; i < num_dsts; i++) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio,
                       &dst[i], 1);
      }
   }

   return num_dsts;
}
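
/*
 * Usage sketch (hypothetical values): with src_type = 4 x float32 and
 * *dst_type = 4 x unorm8, passing num_srcs = 4 hits the 4x4x32 --> 1x16x8
 * special case on SSE2: dst_type->length is widened to 16 and the call
 * returns num_dsts = 1, i.e. one 16 x unorm8 vector in dst[0].
 */
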
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for (i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;
   /*
    * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
    * Only float -> s/unorm8 and (u)int32 -> (u)int8.
    * XXX: This should cover all interesting backend cases for 8 bit,
    * but should use the same strategy if dst is 16 bit.
    */
   if (src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.length   == 4 &&
       src_type.fixed    == 0 &&

       dst_type.floating == 0 &&
       dst_type.fixed    == 0 &&
       dst_type.width    == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
        (src_type.floating == 0 && dst_type.floating == 0 &&
         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&

       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
   {
      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;

      int32_type.width *= 4;
      int32_type.length /= 4;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         if (src_type.floating) {
            for (j = 0; j < dst_type.length / 4; ++j) {
               /*
                * XXX This is not actually fully correct. The float to int
                * conversion will produce 0x80000000 value for everything
                * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq).
                * Hence, NaNs and negatives will get clamped just fine to zero
                * (relying on clamping pack behavior) when converting to unorm,
                * however too large values (both finite and infinite) will also
                * end up as zero, not 255.
                * For snorm, for now we'll keep bug compatibility with the
                * generic conversion path (meaning too large values are fine,
                * but NaNs get converted to -128 (purely by luck, as we don't
                * specify nan behavior for the max there) instead of 0).
                */
               if (dst_type.sign) {
                  tmp[j] = lp_build_min(&bld, bld.one, src[j]);
               }
               else {
                  tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
                                            GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
               }
               tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
               tmp[j] = lp_build_iround(&bld, tmp[j]);
            }
         }
         else {
            for (j = 0; j < dst_type.length / 4; ++j) {
               if (!dst_type.sign) {
                  /*
                   * Pack clamp is always signed->unsigned (or signed->signed).
                   * Hence need min.
                   */
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  tmp[j] = lp_build_min(&bld, src[j], const_max);
               }
               else {
                  tmp[j] = src[j];
               }
            }
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         if (num_srcs < 4) {
            hi = lo;
         }
         else {
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }

      if (num_srcs < 4) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }
   /* Special case 2x8x32 --> 1x16x8, 1x8x32 -> 1x8x8
    */
   else if (src_type.norm     == 0 &&
            src_type.width    == 32 &&
            src_type.length   == 8 &&
            src_type.fixed    == 0 &&

            dst_type.floating == 0 &&
            dst_type.fixed    == 0 &&
            dst_type.width    == 8 &&

            ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
             (src_type.floating == 0 && dst_type.floating == 0 &&
              src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

            ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
             (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&

            util_cpu_caps.has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;

      int32_type.width *= 4;
      int32_type.length /= 4;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 2) {
         for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
            LLVMValueRef lo, hi, a;

            a = src[j];
            if (src_type.floating) {
               if (dst_type.sign) {
                  a = lp_build_min(&bld, bld.one, a);
               }
               else {
                  a = lp_build_min_ext(&bld, bld.one, a,
                                       GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
               }
               a = LLVMBuildFMul(builder, a, const_scale, "");
               a = lp_build_iround(&bld, a);
            }
            else {
               if (!dst_type.sign) {
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  a = lp_build_min(&bld, a, const_max);
               }
            }

            lo = lp_build_extract_range(gallivm, a, 0, 4);
            hi = lp_build_extract_range(gallivm, a, 4, 4);
            /* relying on clamping behavior of sse2 intrinsics here */
            tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
      }

      if (num_srcs == 1) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }
   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for (i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }
   /* Pre convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for (i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }
   /*
    * Clamp if necessary
    */

   if (memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_min < dst_min) {
         if (dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for (i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if (src_max > dst_max) {
         if (dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for (i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }
   /*
    * Scale to the narrowest range
    */

   if (dst_type.floating) {
      /* Nothing to do */
   }
   else if (tmp_type.floating) {
      if (!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for (i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for (i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /*
          * these functions will use fptosi in some form which won't work
          * with 32bit uint dst. Causes lp_test_conv failures though.
          */
         if (0)
            assert(dst_type.sign || dst_type.width < 32);

         if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, tmp_type);
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_iround(&bld, tmp[i]);
            }
            tmp_type.floating = FALSE;
         }
         else {
            LLVMTypeRef tmp_vec_type;

            tmp_type.floating = FALSE;
            tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
            for (i = 0; i < num_tmps; ++i) {
#if 0
               if (dst_type.sign)
                  tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
               else
                  tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
               /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      /* Compensate for different offsets */
      /* sscaled -> unorm and similar would cause negative shift count, skip */
      if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;

            shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if (src_shift > dst_shift) {
         for (i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign   = dst_type.sign;
      new_type.width  = dst_type.width;
      new_type.length = dst_type.length;

      /*
       * Note that resize when using packs can sometimes get min/max
       * clamping for free. Should be able to exploit this...
       */
      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }
   /*
    * Scale to the widest range
    */

   if (src_type.floating) {
      /* Nothing to do */
   }
   else if (!src_type.floating && dst_type.floating) {
      if (!src_type.fixed && !src_type.sign && src_type.norm) {
         for (i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for (i = 0; i < num_tmps; ++i) {
#if 0
            if (dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for (i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* the formula above will produce values below -1.0 for the most
          * negative values, but everything seems happy with that, hence
          * disabled for now */
         if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, dst_type);
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_max(&bld, tmp[i],
                                     lp_build_const_vec(gallivm, dst_type, -1.0f));
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];

         if (dst_shift - src_shift < dst_type.width) {
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
            }
         }
         else {
            /*
             * This happens for things like sscaled -> unorm conversions. Shift
             * counts equal to bit width cause undefined results, so hack around it.
             */
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_zero(gallivm, dst_type);
            }
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }
   for (i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero
 * or set to one. Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{
   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the type flags that are irrelevant for plain bit masks;
    * we assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
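
/*
 * Usage sketch (hypothetical values): narrowing four 4 x i32 masks into a
 * single 16 x i8 mask, e.g. to apply coverage to packed unorm8 colors:
 *
 *    LLVMValueRef mask32[4];   // four 4 x i32 vectors of 0 / -1 lanes
 *    LLVMValueRef mask8;
 *    lp_build_conv_mask(gallivm, lp_type_int_vec(32, 128),
 *                       lp_type_int_vec(8, 128), mask32, 4, &mask8, 1);
 */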