src/gallium/auxiliary/gallivm/lp_bld_conv.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for type conversions.
32 *
33 * We want to use the fastest type for a given computation whenever feasible.
34 * The other side of this is that we need to be able to convert between several
35 * types accurately and efficiently.
36 *
37 * Conversion between types of different bit width is quite complex.
38 *
39 * There are a few invariants to keep in mind in type conversions:
40 *
41 * - register width must remain constant:
42 *
43 * src_type.width * src_type.length == dst_type.width * dst_type.length
44 *
45 * - total number of elements must remain constant:
46 *
47 * src_type.length * num_srcs == dst_type.length * num_dsts
48 *
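*   For example, converting num_srcs == 4 vectors of 4 x float32 into
*   num_dsts == 1 vector of 16 x unorm8 satisfies both invariants:
*   32 * 4 == 8 * 16 per register, and 4 * 4 == 16 * 1 elements overall.
*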
49 * It is not always possible to do the conversion both accurately and
50 * efficiently, usually due to lack of adequate machine instructions. In these
51 * cases it is important not to take shortcuts here and sacrifice accuracy, as
52 * these functions can be used anywhere. In the future we might have a
53 * precision parameter to gauge the accuracy vs efficiency compromise,
54 * but for now, if the data conversion between two stages happens to be the
55 * bottleneck, then most likely we should just avoid converting at all and run
56 * both stages with the same type.
57 *
58 * Make sure to run the lp_test_conv unit test after any change to this file.
59 *
60 * @author Jose Fonseca <jfonseca@vmware.com>
61 */
62
63
64 #include "util/u_debug.h"
65 #include "util/u_math.h"
66 #include "util/u_cpu_detect.h"
67
68 #include "lp_bld_type.h"
69 #include "lp_bld_const.h"
70 #include "lp_bld_arit.h"
71 #include "lp_bld_pack.h"
72 #include "lp_bld_conv.h"
73 #include "lp_bld_intr.h"
74
75
76 /**
77 * Special case for converting clamped IEEE-754 floats to unsigned norms.
78 *
79 * The mathematical voodoo below may seem excessive but it is actually
80 * paramount we do it this way for several reasons. First, there is no
81 * single-precision FP to unsigned integer conversion Intel SSE instruction.
82 * Second, even if there were, since the FP mantissa takes only a fraction of
83 * the register bits, the typical scale-and-cast approach would require double
84 * precision for accurate results, and therefore half the throughput.
85 *
86 * Although the result values can be scaled to an arbitrary bit width specified
87 * by dst_width, the result vector keeps the width of the input (src_type.width).
88 *
89 * Ex: src = { float, float, float, float }
90 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
91 */
92 LLVMValueRef
93 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
94 struct lp_type src_type,
95 unsigned dst_width,
96 LLVMValueRef src)
97 {
98 LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
99 LLVMValueRef res;
100 unsigned mantissa;
101 unsigned n;
102 unsigned long long ubound;
103 unsigned long long mask;
104 double scale;
105 double bias;
106
107 assert(src_type.floating);
108
109 mantissa = lp_mantissa(src_type);
110
111 /* We cannot carry more bits than the mantissa */
112 n = MIN2(mantissa, dst_width);
113
114 /* These magic coefficients make the desired result appear in the
115 * least significant bits of the mantissa.
116 */
117 ubound = ((unsigned long long)1 << n);
118 mask = ubound - 1;
119 scale = (double)mask/ubound;
120 bias = (double)((unsigned long long)1 << (mantissa - n));
121
122 res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
123 res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
124 res = LLVMBuildBitCast(builder, res, int_vec_type, "");
125
126 if(dst_width > n) {
127 int shift = dst_width - n;
128 res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), "");
129
130 /* TODO: Fill in the empty lower bits for additional precision? */
131 /* YES: this fixes progs/trivial/tri-z-eq.c.
132 * Otherwise vertex Z=1.0 values get converted to something like
133 * 0xfffffb00 and the test for equality with 0xffffffff fails.
134 */
135 #if 0
136 {
137 LLVMValueRef msb;
138 msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), "");
139 msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), "");
140 msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), "");
141 res = LLVMBuildOr(builder, res, msb, "");
142 }
143 #elif 0
144 while(shift > 0) {
145 res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), "");
146 shift -= n;
147 n *= 2;
148 }
149 #endif
150 }
151 else
152 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
153
154 return res;
155 }
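
/*
 * For reference, a scalar sketch of the mantissa trick above, assuming
 * 32-bit IEEE-754 floats (23-bit mantissa) and dst_width == 8.  This is a
 * hypothetical illustration only -- clamped_float_to_unorm8_demo() is not
 * part of this file -- but it shows how the scale/bias constants land
 * round(x * 255) in the low bits of the float's bit pattern.
 */
#if 0
#include <stdint.h>
#include <string.h>

static uint32_t
clamped_float_to_unorm8_demo(float x)   /* x assumed clamped to [0, 1] */
{
   const float scale = 255.0f/256.0f;   /* (2^8 - 1) / 2^8 */
   const float bias = 32768.0f;         /* 2^(23 - 8) */
   float f = x*scale + bias;            /* exponent of f is now fixed */
   uint32_t bits;
   memcpy(&bits, &f, sizeof bits);      /* grab the bit pattern */
   return bits & 0xff;                  /* low mantissa bits ~= round(x * 255) */
}
#endif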
156
157
158 /**
159 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
160 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
161 * return {float, float, float, float} with values in range [0, 1].
162 */
163 LLVMValueRef
164 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
165 unsigned src_width,
166 struct lp_type dst_type,
167 LLVMValueRef src)
168 {
169 LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
170 LLVMTypeRef int_vec_type = lp_build_int_vec_type(dst_type);
171 LLVMValueRef bias_;
172 LLVMValueRef res;
173 unsigned mantissa;
174 unsigned n;
175 unsigned long long ubound;
176 unsigned long long mask;
177 double scale;
178 double bias;
179
180 assert(dst_type.floating);
181
182 mantissa = lp_mantissa(dst_type);
183
184 n = MIN2(mantissa, src_width);
185
186 ubound = ((unsigned long long)1 << n);
187 mask = ubound - 1;
188 scale = (double)ubound/mask;
189 bias = (double)((unsigned long long)1 << (mantissa - n));
190
191 res = src;
192
193 if(src_width > mantissa) {
194 int shift = src_width - mantissa;
195 res = LLVMBuildLShr(builder, res, lp_build_const_int_vec(dst_type, shift), "");
196 }
197
198 bias_ = lp_build_const_vec(dst_type, bias);
199
200 res = LLVMBuildOr(builder,
201 res,
202 LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
203
204 res = LLVMBuildBitCast(builder, res, vec_type, "");
205
206 res = LLVMBuildFSub(builder, res, bias_, "");
207 res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), "");
208
209 return res;
210 }
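
/*
 * Likewise, a scalar sketch of the inverse trick, assuming 32-bit IEEE-754
 * floats and src_width == 8 (hypothetical illustration only,
 * unorm8_to_float_demo() is not part of this file): OR-ing the 8-bit value
 * into the low mantissa bits of 2^15 yields exactly 32768.0 + u/256.0, so
 * subtracting the bias and rescaling recovers u/255.0.
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
unorm8_to_float_demo(uint32_t u)   /* u assumed in [0, 255] */
{
   const float bias = 32768.0f;         /* 2^(23 - 8) */
   const float scale = 256.0f/255.0f;   /* 2^8 / (2^8 - 1) */
   uint32_t bits;
   float f;
   memcpy(&bits, &bias, sizeof bits);   /* bit pattern of the bias */
   bits |= u;                           /* inject u into the low mantissa bits */
   memcpy(&f, &bits, sizeof f);
   return (f - bias) * scale;           /* == u / 255.0f */
}
#endif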
211
212
213 /**
214 * Generic type conversion.
215 *
216 * TODO: Take a precision argument, or even better, add a new precision member
217 * to struct lp_type.
218 */
219 void
220 lp_build_conv(LLVMBuilderRef builder,
221 struct lp_type src_type,
222 struct lp_type dst_type,
223 const LLVMValueRef *src, unsigned num_srcs,
224 LLVMValueRef *dst, unsigned num_dsts)
225 {
226 struct lp_type tmp_type;
227 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
228 unsigned num_tmps;
229 unsigned i;
230
231 /* We must not lose or gain channels, only precision */
232 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
233
234 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
235 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
236 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
237 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
238
239 tmp_type = src_type;
240 for(i = 0; i < num_srcs; ++i) {
241 assert(lp_check_value(src_type, src[i]));
242 tmp[i] = src[i];
243 }
244 num_tmps = num_srcs;
245
246
247 /* Special case 4x4f --> 1x16ub
248 */
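/* (Scale by 255, round to signed i32, then narrow i32 -> i16 -> u8 with
 * saturating packs.)
 */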
249 if (src_type.floating == 1 &&
250 src_type.fixed == 0 &&
251 src_type.sign == 1 &&
252 src_type.norm == 0 &&
253 src_type.width == 32 &&
254 src_type.length == 4 &&
255
256 dst_type.floating == 0 &&
257 dst_type.fixed == 0 &&
258 dst_type.sign == 0 &&
259 dst_type.norm == 1 &&
260 dst_type.width == 8 &&
261 dst_type.length == 16)
262 {
263 int i;
264
265 for (i = 0; i < num_dsts; i++, src += 4) {
266 struct lp_type int16_type = dst_type;
267 struct lp_type int32_type = dst_type;
268 LLVMValueRef lo, hi;
269 LLVMValueRef src_int0;
270 LLVMValueRef src_int1;
271 LLVMValueRef src_int2;
272 LLVMValueRef src_int3;
273 LLVMTypeRef int16_vec_type;
274 LLVMTypeRef int32_vec_type;
275 LLVMTypeRef src_vec_type;
276 LLVMTypeRef dst_vec_type;
277 LLVMValueRef const_255f;
278 LLVMValueRef a, b, c, d;
279
280 int16_type.width *= 2;
281 int16_type.length /= 2;
282 int16_type.sign = 1;
283
284 int32_type.width *= 4;
285 int32_type.length /= 4;
286 int32_type.sign = 1;
287
288 src_vec_type = lp_build_vec_type(src_type);
289 dst_vec_type = lp_build_vec_type(dst_type);
290 int16_vec_type = lp_build_vec_type(int16_type);
291 int32_vec_type = lp_build_vec_type(int32_type);
292
293 const_255f = lp_build_const_vec(src_type, 255.0f);
294
295 a = LLVMBuildFMul(builder, src[0], const_255f, "");
296 b = LLVMBuildFMul(builder, src[1], const_255f, "");
297 c = LLVMBuildFMul(builder, src[2], const_255f, "");
298 d = LLVMBuildFMul(builder, src[3], const_255f, "");
299
300 /* lp_build_round generates excessively general code without
301 * sse4, so do rounding manually.
302 */
303 if (!util_cpu_caps.has_sse4_1) {
304 LLVMValueRef const_half = lp_build_const_vec(src_type, 0.5f);
305
306 a = LLVMBuildFAdd(builder, a, const_half, "");
307 b = LLVMBuildFAdd(builder, b, const_half, "");
308 c = LLVMBuildFAdd(builder, c, const_half, "");
309 d = LLVMBuildFAdd(builder, d, const_half, "");
310
311 src_int0 = LLVMBuildFPToSI(builder, a, int32_vec_type, "");
312 src_int1 = LLVMBuildFPToSI(builder, b, int32_vec_type, "");
313 src_int2 = LLVMBuildFPToSI(builder, c, int32_vec_type, "");
314 src_int3 = LLVMBuildFPToSI(builder, d, int32_vec_type, "");
315 }
316 else {
317 struct lp_build_context bld;
318
319 bld.builder = builder;
320 bld.type = src_type;
321 bld.vec_type = src_vec_type;
322 bld.int_elem_type = lp_build_elem_type(int32_type);
323 bld.int_vec_type = int32_vec_type;
324 bld.undef = lp_build_undef(src_type);
325 bld.zero = lp_build_zero(src_type);
326 bld.one = lp_build_one(src_type);
327
328 src_int0 = lp_build_iround(&bld, a);
329 src_int1 = lp_build_iround(&bld, b);
330 src_int2 = lp_build_iround(&bld, c);
331 src_int3 = lp_build_iround(&bld, d);
332 }
333
334 lo = lp_build_pack2(builder, int32_type, int16_type, src_int0, src_int1);
335 hi = lp_build_pack2(builder, int32_type, int16_type, src_int2, src_int3);
336 dst[i] = lp_build_pack2(builder, int16_type, dst_type, lo, hi);
337 }
338 return;
339 }
340
341 /*
342 * Clamp if necessary
343 */
344
345 if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
346 struct lp_build_context bld;
347 double src_min = lp_const_min(src_type);
348 double dst_min = lp_const_min(dst_type);
349 double src_max = lp_const_max(src_type);
350 double dst_max = lp_const_max(dst_type);
351 LLVMValueRef thres;
352
353 lp_build_context_init(&bld, builder, tmp_type);
354
355 if(src_min < dst_min) {
356 if(dst_min == 0.0)
357 thres = bld.zero;
358 else
359 thres = lp_build_const_vec(src_type, dst_min);
360 for(i = 0; i < num_tmps; ++i)
361 tmp[i] = lp_build_max(&bld, tmp[i], thres);
362 }
363
364 if(src_max > dst_max) {
365 if(dst_max == 1.0)
366 thres = bld.one;
367 else
368 thres = lp_build_const_vec(src_type, dst_max);
369 for(i = 0; i < num_tmps; ++i)
370 tmp[i] = lp_build_min(&bld, tmp[i], thres);
371 }
372 }
373
374 /*
375 * Scale to the narrowest range
376 */
377
378 if(dst_type.floating) {
379 /* Nothing to do */
380 }
381 else if(tmp_type.floating) {
382 if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
383 for(i = 0; i < num_tmps; ++i) {
384 tmp[i] = lp_build_clamped_float_to_unsigned_norm(builder,
385 tmp_type,
386 dst_type.width,
387 tmp[i]);
388 }
389 tmp_type.floating = FALSE;
390 }
391 else {
392 double dst_scale = lp_const_scale(dst_type);
393 LLVMTypeRef tmp_vec_type;
394
395 if (dst_scale != 1.0) {
396 LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale);
397 for(i = 0; i < num_tmps; ++i)
398 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
399 }
400
401 /* Use an equally sized integer for intermediate computations */
402 tmp_type.floating = FALSE;
403 tmp_vec_type = lp_build_vec_type(tmp_type);
404 for(i = 0; i < num_tmps; ++i) {
405 #if 0
406 if(dst_type.sign)
407 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
408 else
409 tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
410 #else
411 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
412 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
413 #endif
414 }
415 }
416 }
417 else {
418 unsigned src_shift = lp_const_shift(src_type);
419 unsigned dst_shift = lp_const_shift(dst_type);
420
421 /* FIXME: compensate different offsets too */
422 if(src_shift > dst_shift) {
423 LLVMValueRef shift = lp_build_const_int_vec(tmp_type, src_shift - dst_shift);
424 for(i = 0; i < num_tmps; ++i)
425 if(src_type.sign)
426 tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
427 else
428 tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
429 }
430 }
431
432 /*
433 * Truncate or expand bit width
434 *
435 * No data conversion should happen here, although the sign bits are
436 * crucial to avoid bad clamping.
437 */
438
439 {
440 struct lp_type new_type;
441
442 new_type = tmp_type;
443 new_type.sign = dst_type.sign;
444 new_type.width = dst_type.width;
445 new_type.length = dst_type.length;
446
447 lp_build_resize(builder, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
448
449 tmp_type = new_type;
450 num_tmps = num_dsts;
451 }
452
453 /*
454 * Scale to the widest range
455 */
456
457 if(src_type.floating) {
458 /* Nothing to do */
459 }
460 else if(!src_type.floating && dst_type.floating) {
461 if(!src_type.fixed && !src_type.sign && src_type.norm) {
462 for(i = 0; i < num_tmps; ++i) {
463 tmp[i] = lp_build_unsigned_norm_to_float(builder,
464 src_type.width,
465 dst_type,
466 tmp[i]);
467 }
468 tmp_type.floating = TRUE;
469 }
470 else {
471 double src_scale = lp_const_scale(src_type);
472 LLVMTypeRef tmp_vec_type;
473
474 /* Use an equally sized float for intermediate computations */
475 tmp_type.floating = TRUE;
476 tmp_type.sign = TRUE;
477 tmp_vec_type = lp_build_vec_type(tmp_type);
478 for(i = 0; i < num_tmps; ++i) {
479 #if 0
480 if(dst_type.sign)
481 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
482 else
483 tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
484 #else
485 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
486 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
487 #endif
488 }
489
490 if (src_scale != 1.0) {
491 LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale);
492 for(i = 0; i < num_tmps; ++i)
493 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
494 }
495 }
496 }
497 else {
498 unsigned src_shift = lp_const_shift(src_type);
499 unsigned dst_shift = lp_const_shift(dst_type);
500
501 /* FIXME: compensate different offsets too */
502 if(src_shift < dst_shift) {
503 LLVMValueRef shift = lp_build_const_int_vec(tmp_type, dst_shift - src_shift);
504 for(i = 0; i < num_tmps; ++i)
505 tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
506 }
507 }
508
509 for(i = 0; i < num_dsts; ++i) {
510 dst[i] = tmp[i];
511 assert(lp_check_value(dst_type, dst[i]));
512 }
513 }
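
/*
 * Usage sketch (hypothetical, for illustration only): converting four
 * 4 x float vectors with values already clamped to [0, 1] into a single
 * 16 x unorm8 vector, e.g. when writing out an 8-bit RGBA color buffer.
 * This hits the 4x4f --> 1x16ub special case above.
 */
#if 0
   struct lp_type f32_type;    /* 4 x float32 */
   struct lp_type un8_type;    /* 16 x unorm8 */
   LLVMValueRef src[4];        /* four 4 x float vectors */
   LLVMValueRef dst[1];

   memset(&f32_type, 0, sizeof f32_type);
   f32_type.floating = TRUE;
   f32_type.sign = TRUE;
   f32_type.width = 32;
   f32_type.length = 4;

   memset(&un8_type, 0, sizeof un8_type);
   un8_type.norm = TRUE;
   un8_type.width = 8;
   un8_type.length = 16;

   /* ... fill src[0..3] ... */

   lp_build_conv(builder, f32_type, un8_type, src, 4, dst, 1);
#endif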
514
515
516 /**
517 * Bit mask conversion.
518 *
519 * This will convert integer masks between the given types.
520 *
521 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
522 * Any other value will likely cause unpredictable results.
523 *
524 * This is basically a very trimmed down version of lp_build_conv.
525 */
526 void
527 lp_build_conv_mask(LLVMBuilderRef builder,
528 struct lp_type src_type,
529 struct lp_type dst_type,
530 const LLVMValueRef *src, unsigned num_srcs,
531 LLVMValueRef *dst, unsigned num_dsts)
532 {
533 /* Register width must remain constant */
534 assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
535
536 /* We must not lose or gain channels, only precision */
537 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
538
539 /*
540 * Reduce both types to plain signed integers.
541 *
542 * We assume all values are 0 or -1, so only the bit pattern matters.
543 */
544
545 src_type.floating = FALSE;
546 src_type.fixed = FALSE;
547 src_type.sign = TRUE;
548 src_type.norm = FALSE;
549
550 dst_type.floating = FALSE;
551 dst_type.fixed = FALSE;
552 dst_type.sign = TRUE;
553 dst_type.norm = FALSE;
554
555 /*
556 * Truncate or expand bit width
557 */
558
559 if(src_type.width > dst_type.width) {
560 assert(num_dsts == 1);
561 dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
562 }
563 else if(src_type.width < dst_type.width) {
564 assert(num_srcs == 1);
565 lp_build_unpack(builder, src_type, dst_type, src[0], dst, num_dsts);
566 }
567 else {
568 assert(num_srcs == num_dsts);
569 memcpy(dst, src, num_dsts * sizeof *dst);
570 }
571 }
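
/*
 * Usage sketch for lp_build_conv_mask (hypothetical, for illustration only):
 * repacking two 4 x i32 masks, each lane 0 or ~0, into one 8 x i16 mask.
 * Both invariants hold: 32*4 == 16*8 per register and 4*2 == 8*1 elements.
 */
#if 0
   struct lp_type mask32_type;   /* 4 x i32, lanes are 0 or -1 */
   struct lp_type mask16_type;   /* 8 x i16, lanes are 0 or -1 */
   LLVMValueRef src_mask[2];
   LLVMValueRef dst_mask[1];

   memset(&mask32_type, 0, sizeof mask32_type);
   mask32_type.width = 32;
   mask32_type.length = 4;

   memset(&mask16_type, 0, sizeof mask16_type);
   mask16_type.width = 16;
   mask16_type.length = 8;

   /* ... fill src_mask[0..1] ... */

   lp_build_conv_mask(builder, mask32_type, mask16_type, src_mask, 2, dst_mask, 1);
#endif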