/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since a
 * single source vector may need to be split into several destination vectors,
 * or several sources merged into one destination, as the invariants below
 * imply.
 *
 * There are a few invariants to remember in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
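 *
 * For example, converting four registers of 4 x float32 into one register of
 * 16 x unorm8 keeps the register width at 128 bits on both sides and the
 * total element count at 16 (4 * 4 == 16 * 1), so num_srcs == 4 and
 * num_dsts == 1.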
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_conv.h"

/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single-precision FP to unsigned integer conversion Intel SSE instruction.
 * Second, even if there were, since the FP's mantissa takes only a fraction
 * of the register bits, the typical scale-and-cast approach would require
 * double precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width
 * specified by dst_width, the actual result type will have the same width as
 * the source type.
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(src_type.floating);

   mantissa = lp_mantissa(src_type);

   /* We cannot carry more bits than the mantissa */
   n = MIN2(mantissa, dst_width);

   /* These magic coefficients will make the desired result appear in the
    * least significant bits of the mantissa.
    */
   ubound = ((unsigned long long)1 << n);
   mask = ubound - 1;
   scale = (double)mask/ubound;
   bias = (double)((unsigned long long)1 << (mantissa - n));
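
   /*
    * E.g., for src_type = float32 (mantissa = 23) and dst_width = 8:
    * n = 8, ubound = 256, mask = 255, scale = 255/256 and bias = 2^15.
    * Then x*scale + bias == 2^15 * (1 + x*255/2^23), i.e. x*255 ends up in
    * the low mantissa bits, where the bitcast below reads it off as an
    * integer.
    */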
   res = LLVMBuildMul(builder, src, lp_build_const_scalar(src_type, scale), "");
   res = LLVMBuildAdd(builder, res, lp_build_const_scalar(src_type, bias), "");
   res = LLVMBuildBitCast(builder, res, int_vec_type, "");

   if(dst_width > n) {
      int shift = dst_width - n;
      res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");

      /* TODO: Fill in the empty lower bits for additional precision? */
#if 0
      {
         LLVMValueRef msb;
         msb = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, dst_width - 1), "");
         msb = LLVMBuildShl(builder, msb, lp_build_int_const_scalar(src_type, shift), "");
         msb = LLVMBuildSub(builder, msb, lp_build_int_const_scalar(src_type, 1), "");
         res = LLVMBuildOr(builder, res, msb, "");
      }
#elif 0
      while(shift > 0) {
         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, n), ""), "");
         shift -= n;
         n *= 2;
      }
#endif
   }
   else
      res = LLVMBuildAnd(builder, res, lp_build_int_const_scalar(src_type, mask), "");

   return res;
}

/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(dst_type);
   LLVMValueRef res;
   LLVMValueRef bias_;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   mantissa = lp_mantissa(dst_type);

   n = MIN2(mantissa, src_width);

   ubound = ((unsigned long long)1 << n);
   mask = ubound - 1;
   scale = (double)ubound/mask;
   bias = (double)((unsigned long long)1 << (mantissa - n));
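
   /*
    * E.g., for dst_type = float32 and src_width = 8: OR'ing the 8-bit value
    * x into the mantissa of the constant bias = 2^15 yields the float
    * 2^15 + x/256, so subtracting bias and multiplying by scale = 256/255
    * recovers x/255 in [0, 1].
    */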
   res = src;

   if(src_width > mantissa) {
      int shift = src_width - mantissa;
      res = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(dst_type, shift), "");
   }

   bias_ = lp_build_const_scalar(dst_type, bias);

   res = LLVMBuildOr(builder,
                     res,
                     LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

   res = LLVMBuildBitCast(builder, res, vec_type, "");

   res = LLVMBuildSub(builder, res, bias_, "");
   res = LLVMBuildMul(builder, res, lp_build_const_scalar(dst_type, scale), "");

   return res;
}

/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */
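   /* E.g., n = 8, lo_hi = 0 gives {0, 8, 1, 9, 2, 10, 3, 11} and lo_hi = 1
    * gives {4, 12, 5, 13, 6, 14, 7, 15}: the low (resp. high) halves of the
    * two operands, interleaved.
    */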
   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
   }

   return LLVMConstVector(elems, n);
}

/**
 * Build shuffle vectors that match PACKxx instructions.
 */
static LLVMValueRef
lp_build_const_pack_shuffle(unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   /* TODO: cache results in a static table */
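   /* E.g., n = 8 gives {0, 2, 4, 6, 8, 10, 12, 14}: the even-indexed
    * elements of the two concatenated operands, i.e. the low half of each
    * wide element.
    */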
   for(i = 0; i < n; ++i)
      elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);

   return LLVMConstVector(elems, n);
}

/**
 * Expand the bit width.
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 */
static void
lp_build_expand(LLVMBuilderRef builder,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;
   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type new_type = src_type;
      LLVMTypeRef new_vec_type;

      new_type.width *= 2;
      new_type.length /= 2;
      new_vec_type = lp_build_vec_type(new_type);

      for(i = num_tmps; i--; ) {
         LLVMValueRef zero;
         LLVMValueRef shuffle_lo;
         LLVMValueRef shuffle_hi;
         LLVMValueRef lo;
         LLVMValueRef hi;

         zero = lp_build_zero(src_type);
         shuffle_lo = lp_build_const_unpack_shuffle(src_type.length, 0);
         shuffle_hi = lp_build_const_unpack_shuffle(src_type.length, 1);
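
         /* Interleaving with zero in the high halves widens each element in
          * place: the original value lands in the low half of an element
          * twice as wide, i.e. a zero-extension.
          */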
         /* PUNPCKLBW, PUNPCKHBW */
         lo = LLVMBuildShuffleVector(builder, dst[i], zero, shuffle_lo, "");
         hi = LLVMBuildShuffleVector(builder, dst[i], zero, shuffle_hi, "");

         dst[2*i + 0] = LLVMBuildBitCast(builder, lo, new_vec_type, "");
         dst[2*i + 1] = LLVMBuildBitCast(builder, hi, new_vec_type, "");
      }

      src_type = new_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}

/**
 * Non-interleaved pack.
 *
 * This will move values as
 *
 *   lo = __ l0 __ l1 __ l2 __..  __ ln
 *   hi = __ h0 __ h1 __ h2 __..  __ hn
 *   res =   l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * TODO: handle saturation consistently.
 */
static LLVMValueRef
lp_build_pack2(LLVMBuilderRef builder,
               struct lp_type src_type,
               struct lp_type dst_type,
               boolean clamped,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res;
   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * 2 == dst_type.length);

   assert(!src_type.floating);
   assert(!dst_type.floating);

   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
      /* All X86 non-interleaved pack instructions take signed inputs and
       * saturate them, so saturate beforehand. */
      if(!src_type.sign && !clamped) {
         struct lp_build_context bld;
         unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
         LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
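         /* E.g., for an unsigned 16 -> 8 bit pack dst_bits is 8 and dst_max
          * is 255: a word with the top bit set (e.g. 0x8000) would otherwise
          * be seen as negative by PACKUSWB and saturate to 0 rather than 255.
          */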
         lp_build_context_init(&bld, builder, src_type);
         lo = lp_build_min(&bld, lo, dst_max);
         hi = lp_build_min(&bld, hi, dst_max);
      }

      switch(src_type.width) {
      case 32:
         if(dst_type.sign || !util_cpu_caps.has_sse4_1)
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
         else
            /* PACKUSDW is the only intrinsic with a consistent signature */
            return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
         break;

      case 16:
         if(dst_type.sign)
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
         else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
         break;

      default:
         assert(0);
         return LLVMGetUndef(dst_vec_type);
      }

      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
      return res;
   }

   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}

/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
 */
static LLVMValueRef
lp_build_pack(LLVMBuilderRef builder,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;
   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length);
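
   /* E.g., packing 4 vectors of 4 x uint32 into 1 vector of 16 x uint8
    * takes two passes: 32 -> 16 bits (4 vectors to 2), then 16 -> 8 bits
    * (2 vectors to 1).
    */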
   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type new_type = src_type;

      new_type.width /= 2;
      new_type.length *= 2;

      /* Take the sign change into consideration only in the last step */
      if(new_type.width == dst_type.width)
         new_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = lp_build_pack2(builder, src_type, new_type, clamped,
                                 tmp[2*i + 0], tmp[2*i + 1]);

      src_type = new_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}

/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
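 *
 * A minimal usage sketch (builder, src and dst are assumed to already exist;
 * the lp_type fields are those of lp_bld_type.h), converting four
 * 4 x float32 vectors into one 16 x unorm8 vector:
 *
 *    struct lp_type f32 = {0}, u8n = {0};
 *    f32.floating = TRUE; f32.sign = TRUE; f32.width = 32; f32.length = 4;
 *    u8n.norm = TRUE; u8n.width = 8; u8n.length = 16;
 *    lp_build_conv(builder, f32, u8n, src, 4, dst, 1);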
 */
void
lp_build_conv(LLVMBuilderRef builder,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];
   num_tmps = num_srcs;

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, builder, tmp_type);
      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_scalar(src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_scalar(src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(builder,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;
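
         /* E.g., for an 8-bit unsigned normalized destination dst_scale is
          * 255.0, so 1.0 maps to 255 before the float -> integer cast below.
          */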
         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_scalar(tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);

      /* FIXME: compensate different offsets too */
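      /* E.g., converting 16.16 fixed point to 8.8 fixed point shifts right
       * by 8, arithmetic or logical depending on the source sign.
       */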
      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }
   /*
    * Truncate or expand bit width
    */

   assert(!tmp_type.floating || tmp_type.width == dst_type.width);

   if(tmp_type.width > dst_type.width) {
      assert(num_dsts == 1);
      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
      tmp_type.width = dst_type.width;
      tmp_type.length = dst_type.length;
      num_tmps = 1;
   }

   if(tmp_type.width < dst_type.width) {
      assert(num_tmps == 1);
      lp_build_expand(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
      tmp_type.width = dst_type.width;
      tmp_type.length = dst_type.length;
      num_tmps = num_dsts;
   }

   assert(tmp_type.width == dst_type.width);
   assert(tmp_type.length == dst_type.length);
   assert(num_tmps == num_dsts);
   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(builder,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(src_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_scalar(tmp_type, 1.0/src_scale);
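            /* E.g., for an 8-bit unsigned normalized source src_scale is
             * 255.0, so the integer -> float cast is followed by a multiply
             * by 1/255 to land in [0, 1].
             */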
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);

      /* FIXME: compensate different offsets too */
      if(src_shift < dst_shift) {
         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, dst_shift - src_shift);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
      }
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}

/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or
 * one. Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(LLVMBuilderRef builder,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{
   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
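
   /* E.g., 4 masks of 4 x int32 can pack into 1 mask of 16 x int8: 0 stays
    * 0 and -1 stays -1 under the signed saturation used by lp_build_pack.
    */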
   /*
    * We assume all values are 0 or -1
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   if(src_type.width > dst_type.width) {
      assert(num_dsts == 1);
      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
   }
   else if(src_type.width < dst_type.width) {
      assert(num_srcs == 1);
      lp_build_expand(builder, src_type, dst_type, src[0], dst, num_dsts);
   }
   else {
      assert(num_srcs == num_dsts);
      memcpy(dst, src, num_dsts * sizeof *dst);
   }
}