/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex since a
 * multi-step conversion is often required.
 *
 * There are a few invariants to remember in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
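 *
 * For example (this is the 4x4f --> 1x16ub special case handled below):
 * converting four vectors of 4 x f32 into one vector of 16 x unorm8
 * satisfies both invariants, since 32 * 4 == 8 * 16 and 4 * 4 == 16 * 1.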
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely it is better to avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"


/**
 * Byte swap on an element. It will construct a call to the llvm.bswap
 * intrinsic of the matching width.
 *
 * @param res    element to byte swap.
 * @param type   int16_t, int32_t, int64_t, float or double
 */
LLVMValueRef
lp_build_bswap(struct gallivm_state *gallivm,
               LLVMValueRef res,
               struct lp_type type)
{
   LLVMTypeRef int_type = LLVMIntTypeInContext(gallivm->context,
                                               type.width);
   const char *intrinsic = NULL;

   if (type.width == 16)
      intrinsic = "llvm.bswap.i16";
   else if (type.width == 32)
      intrinsic = "llvm.bswap.i32";
   else if (type.width == 64)
      intrinsic = "llvm.bswap.i64";

   assert(intrinsic != NULL);

   /* In case of a floating-point type, cast to an int of the same size and
    * then cast back to the fp type.
    */
   if (type.floating)
      res = LLVMBuildBitCast(gallivm->builder, res, int_type, "");
   res = lp_build_intrinsic_unary(gallivm->builder, intrinsic, int_type, res);
   if (type.floating)
      res = LLVMBuildBitCast(gallivm->builder, res,
                             lp_build_elem_type(gallivm, type), "");
   return res;
}


/**
 * Byte swap every element in the vector.
 *
 * @param packed         <vector> to convert
 * @param src_type_vec   <vector> type of int16_t, int32_t, int64_t, float or
 *                       double
 * @param dst_type_vec   <vector> type to return
 */
LLVMValueRef
lp_build_bswap_vec(struct gallivm_state *gallivm,
                   LLVMValueRef packed,
                   struct lp_type src_type_vec,
                   struct lp_type dst_type_vec)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_type = lp_build_elem_type(gallivm, dst_type_vec);
   LLVMValueRef res;
   unsigned i;

   if (src_type_vec.length == 1) {
      res = lp_build_bswap(gallivm, packed, src_type_vec);
      res = LLVMBuildBitCast(gallivm->builder, res, dst_type, "");
   }
   else {
      res = LLVMGetUndef(lp_build_vec_type(gallivm, dst_type_vec));
      for (i = 0; i < src_type_vec.length; ++i) {
         LLVMValueRef index = lp_build_const_int32(gallivm, i);
         LLVMValueRef elem = LLVMBuildExtractElement(builder, packed, index, "");
         elem = lp_build_bswap(gallivm, elem, src_type_vec);
         elem = LLVMBuildBitCast(gallivm->builder, elem, dst_type, "");
         res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
      }
   }
   return res;
}


/**
 * Converts int16 half-float to float32.
 * Note this can be performed in 1 instruction if the F16C extension's
 * vcvtph2ps is available.
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src   value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 * ref https://gist.github.com/2144712
 */
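/*
 * Worked example of the exponent rescaling below (informative): for the
 * half 1.0, h = 0x3C00, so expmant = 0x3C00 and expmant << 13 = 0x07800000,
 * which is the float 2^-112 (biased exponent 15). f32_magic has biased
 * exponent 254 - 15 = 239, i.e. the value 2^112, so the FMul rebiases the
 * exponent from the half bias 15 to the float bias 127 and yields exactly
 * 1.0f. The sign bit is reattached separately, and the Inf/NaN path forces
 * the result exponent to 255.
 */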
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   int src_length = LLVMGetVectorSize(LLVMTypeOf(src));

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);

   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);

   /* Constants */
   LLVMValueRef i32_13          = lp_build_const_int_vec(gallivm, i32_type, 13);
   LLVMValueRef i32_16          = lp_build_const_int_vec(gallivm, i32_type, 16);
   LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
   LLVMValueRef i32_was_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
   LLVMValueRef i32_exp_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
   LLVMValueRef f32_magic       = LLVMBuildBitCast(builder,
                                                   lp_build_const_int_vec(gallivm, i32_type,
                                                                          (254 - 15) << 23),
                                                   float_vec_type, "");

   /* Convert int16 vector to int32 vector by zero ext */
   LLVMValueRef h               = LLVMBuildZExt(builder, src, int_vec_type, "");

   /* Exponent / mantissa bits */
   LLVMValueRef expmant         = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
   LLVMValueRef shifted         = LLVMBuildBitCast(builder,
                                                   LLVMBuildShl(builder, expmant, i32_13, ""),
                                                   float_vec_type, "");

   /* Exponent adjust */
   LLVMValueRef scaled          = LLVMBuildBitCast(builder,
                                                   LLVMBuildFMul(builder, shifted, f32_magic, ""),
                                                   int_vec_type, "");

   /* Make sure Inf/NaN survive */
   LLVMValueRef b_wasinfnan     = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
                                                   expmant, i32_was_infnan);
   LLVMValueRef infnanexp       = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");

   /* Sign bit */
   LLVMValueRef justsign        = LLVMBuildXor(builder, h, expmant, "");
   LLVMValueRef sign            = LLVMBuildShl(builder, justsign, i32_16, "");

   /* Combine result */
   LLVMValueRef sign_inf        = LLVMBuildOr(builder, sign, infnanexp, "");
   LLVMValueRef final           = LLVMBuildOr(builder, scaled, sign_inf, "");

   /* Cast from int32 vector to float32 vector */
   return LLVMBuildBitCast(builder, final, float_vec_type, "");
}


/**
 * Converts float32 to int16 half-float.
 * Note this can be performed in 1 instruction if the F16C extension's
 * vcvtps2ph is available.
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src   value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 * ref https://gist.github.com/2156668
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   int src_length = LLVMGetVectorSize(LLVMTypeOf(src));

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);

   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);

   struct lp_build_context bld;

   LLVMValueRef result;

   lp_build_context_init(&bld, gallivm, i32_type);

   /* Extra scope because lp_build_min needs a build context, le sigh */
   {
      /* Constants */
      LLVMValueRef i32_13        = lp_build_const_int_vec(gallivm, i32_type, 13);
      LLVMValueRef i32_16        = lp_build_const_int_vec(gallivm, i32_type, 16);
      LLVMValueRef i32_mask_fabs = lp_build_const_int_vec(gallivm, i32_type, 0x7fffffff);
      LLVMValueRef i32_f32infty  = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
      LLVMValueRef i32_expinf    = lp_build_const_int_vec(gallivm, i32_type, 0xe0 << 23);
      LLVMValueRef i32_f16max    = lp_build_const_int_vec(gallivm, i32_type, 0x8f << 23);
      LLVMValueRef i32_magic     = lp_build_const_int_vec(gallivm, i32_type, 0x0f << 23);

      /* Cast from float32 to int32 */
      LLVMValueRef f             = LLVMBuildBitCast(builder, src, int_vec_type, "");

      /* Clear the sign bit */
      LLVMValueRef fabs          = LLVMBuildAnd(builder, i32_mask_fabs, f, "");

      /* Magic conversion. The rescaling must be done as a floating-point
       * multiply: the clamped value and the magic constant are float bit
       * patterns.
       */
      LLVMValueRef clamped       = lp_build_min(&bld, i32_f16max, fabs);
      LLVMValueRef scaled        = LLVMBuildBitCast(builder,
                                                    LLVMBuildFMul(builder,
                                                                  LLVMBuildBitCast(builder, clamped,
                                                                                   float_vec_type, ""),
                                                                  LLVMBuildBitCast(builder, i32_magic,
                                                                                   float_vec_type, ""),
                                                                  ""),
                                                    int_vec_type, "");

      /* Make sure Inf/NaN survive */
      LLVMValueRef infnancase    = LLVMBuildXor(builder, i32_expinf, fabs, "");
      LLVMValueRef b_notnormal   = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
                                                    fabs, i32_f32infty);

      /* Merge the normal and Inf/NaN cases, i.e. select
       * (b_notnormal & infnancase) | (~b_notnormal & scaled).
       */
      LLVMValueRef merge1        = LLVMBuildAnd(builder, infnancase, b_notnormal, "");
      LLVMValueRef merge2        = LLVMBuildAnd(builder,
                                                LLVMBuildNot(builder, b_notnormal, ""),
                                                scaled, "");
      LLVMValueRef merged        = LLVMBuildOr(builder, merge1, merge2, "");
      LLVMValueRef shifted       = LLVMBuildLShr(builder, merged, i32_13, "");

      /* Sign bit */
      LLVMValueRef justsign      = LLVMBuildXor(builder, f, fabs, "");
      LLVMValueRef signshifted   = LLVMBuildLShr(builder, justsign, i32_16, "");

      /* Combine result */
      result                     = LLVMBuildOr(builder, shifted, signshifted, "");
   }

   /* Truncate from 32 bit to 16 bit */
   i32_type.width = 16;
   return LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i32_type), "");
}


/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single precision FP to unsigned integer conversion Intel SSE instruction.
 * Second, even if there were, since the FP's mantissa takes only a fraction
 * of register bits, the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width
 * specified by dst_width, the actual result type will have the same width
 * as src_type.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */
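
      /*
       * Worked example (informative): for f32 (mantissa = 23) and
       * dst_width = 8, scale = 255.0/256.0 and bias = 2^15. After the
       * multiply, src * scale lies in [0, 255/256]; adding 2^15 pins the
       * float's mantissa LSB to exactly 2^-8, so round(src * 255) lands,
       * correctly rounded, in the low 8 bits, where the final mask
       * extracts it.
       */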

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). So do a straight
       * multiplication followed by casting. No further rounding is necessary.
       */

      double scale;

      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }
   else {
      /*
       * The destination exceeds what can be represented in the floating
       * point. So multiply by the largest power of two we can get away
       * with, and then subtract the most significant bit to rescale to
       * normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion (FPToSI). In theory it should be
       * (1 << (src_type.width - 2)), but IEEE 754 rules state INT_MIN
       * should be returned in FPToSI on overflow, which is the correct
       * result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally
       * or more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      lshifted = LLVMBuildShl(builder, res,
                              lp_build_const_int_vec(gallivm, src_type,
                                                     lshift), "");

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB copy from the LSB-aligned copy, thereby re-scaling
       * from (1 << dst_width) to ((1 << dst_width) - 1).
       */
      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}


/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
 *
 * Returns the number of dsts created from src.
 */
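/*
 * Illustrative usage sketch (not part of the original code; variable names
 * are hypothetical). Converting four 4 x f32 vectors to unorm8 lets
 * lp_build_conv_auto pick the packed 4x4f --> 1x16ub path on SSE2:
 *
 *    struct lp_type f32x4 = lp_type_float_vec(32, 128);
 *    struct lp_type u8n;
 *    LLVMValueRef dst[LP_MAX_VECTOR_LENGTH];
 *    int num_dsts;
 *
 *    memset(&u8n, 0, sizeof u8n);
 *    u8n.width = 8;
 *    u8n.norm = TRUE;
 *    u8n.length = 4;
 *
 *    num_dsts = lp_build_conv_auto(gallivm, f32x4, &u8n, src, 4, dst);
 *    // On SSE2 this yields num_dsts == 1, with u8n.length updated to 16.
 */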
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   int i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed == 0 &&
       src_type.sign == 1 &&
       src_type.norm == 0 &&
       src_type.width == 32 &&

       dst_type->floating == 0 &&
       dst_type->fixed == 0 &&
       dst_type->sign == 0 &&
       dst_type->norm == 1 &&
       dst_type->width == 8)
   {
      /* Special case 4x4f --> 1x16ub */
      if (src_type.length == 4 && util_cpu_caps.has_sse2)
      {
         assert((num_srcs % 4) == 0);

         num_dsts = num_srcs / 4;
         dst_type->length = 16;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8f --> 1x16ub */
      if (src_type.length == 8 && util_cpu_caps.has_avx)
      {
         assert((num_srcs % 2) == 0);

         num_dsts = num_srcs / 2;
         dst_type->length = 16;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N conversion */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   }
   else {
      for (i = 0; i < num_srcs; ++i) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
      }
   }

   return num_dsts;
}


/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;

   /* Special case 4x4f --> 1x16ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed == 0 &&
       src_type.sign == 1 &&
       src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.sign == 0 &&
       dst_type.norm == 1 &&
       dst_type.width == 8 &&
       dst_type.length == 16 &&

       4 * num_dsts == num_srcs &&

       util_cpu_caps.has_sse2)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned j;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         for (j = 0; j < 4; ++j) {
            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
            tmp[j] = lp_build_iround(&bld, tmp[j]);
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }

      return;
   }

   /* Special case 2x8f --> 1x16ub
    */
   else if (src_type.floating == 1 &&
            src_type.fixed == 0 &&
            src_type.sign == 1 &&
            src_type.norm == 0 &&
            src_type.width == 32 &&
            src_type.length == 8 &&

            dst_type.floating == 0 &&
            dst_type.fixed == 0 &&
            dst_type.sign == 0 &&
            dst_type.norm == 1 &&
            dst_type.width == 8 &&
            dst_type.length == 16 &&

            2 * num_dsts == num_srcs &&

            util_cpu_caps.has_avx)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 2) {
         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");

         a = lp_build_iround(&bld, a);
         b = lp_build_iround(&bld, b);

         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }

      return;
   }

   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }

   /* Pre convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      /* Compensate for different offsets */
      if (dst_offset > src_offset && src_type.width > dst_type.width) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
            if(src_type.sign)
               shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               shifted = LLVMBuildLShr(builder, tmp[i], shift, "");

            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign   = dst_type.sign;
      new_type.width  = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(src_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);

         for (i = 0; i < num_tmps; ++i) {
            pre_shift[i] = tmp[i];
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or
 * one. Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
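/*
 * For example (informative): four <4 x i32> masks can be converted into a
 * single <16 x i8> mask this way; each 0/-1 32-bit element simply narrows
 * to a 0/-1 8-bit element in lp_build_resize.
 */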
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{
   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * We assume all values are 0 or -1
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}