util: treat denorm'ed floats like zero
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks for the special-case values a or b = 1 or 0 are done.
76 */
77 static LLVMValueRef
78 lp_build_min_simple(struct lp_build_context *bld,
79 LLVMValueRef a,
80 LLVMValueRef b)
81 {
82 const struct lp_type type = bld->type;
83 const char *intrinsic = NULL;
84 unsigned intr_size = 0;
85 LLVMValueRef cond;
86
87 assert(lp_check_value(type, a));
88 assert(lp_check_value(type, b));
89
90 /* TODO: optimize the constant case */
91
92 if (type.floating && util_cpu_caps.has_sse) {
93 if (type.width == 32) {
94 if (type.length == 1) {
95 intrinsic = "llvm.x86.sse.min.ss";
96 intr_size = 128;
97 }
98 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
99 intrinsic = "llvm.x86.sse.min.ps";
100 intr_size = 128;
101 }
102 else {
103 intrinsic = "llvm.x86.avx.min.ps.256";
104 intr_size = 256;
105 }
106 }
107 if (type.width == 64 && util_cpu_caps.has_sse2) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse2.min.sd";
110 intr_size = 128;
111 }
112 else if (type.length == 2 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse2.min.pd";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.pd.256";
118 intr_size = 256;
119 }
120 }
121 }
122 else if (type.floating && util_cpu_caps.has_altivec) {
123 if (type.width == 32 && type.length == 4) {
124 intrinsic = "llvm.ppc.altivec.vminfp";
125 intr_size = 128;
126 }
127 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
128 intr_size = 128;
129 if ((type.width == 8 || type.width == 16) &&
130 (type.width * type.length <= 64) &&
131 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
132 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
133 __FUNCTION__);
134 }
135 if (type.width == 8 && !type.sign) {
136 intrinsic = "llvm.x86.sse2.pminu.b";
137 }
138 else if (type.width == 16 && type.sign) {
139 intrinsic = "llvm.x86.sse2.pmins.w";
140 }
141 if (util_cpu_caps.has_sse4_1) {
142 if (type.width == 8 && type.sign) {
143 intrinsic = "llvm.x86.sse41.pminsb";
144 }
145 if (type.width == 16 && !type.sign) {
146 intrinsic = "llvm.x86.sse41.pminuw";
147 }
148 if (type.width == 32 && !type.sign) {
149 intrinsic = "llvm.x86.sse41.pminud";
150 }
151 if (type.width == 32 && type.sign) {
152 intrinsic = "llvm.x86.sse41.pminsd";
153 }
154 }
155 } else if (util_cpu_caps.has_altivec) {
156 intr_size = 128;
157 if (type.width == 8) {
158 if (!type.sign) {
159 intrinsic = "llvm.ppc.altivec.vminub";
160 } else {
161 intrinsic = "llvm.ppc.altivec.vminsb";
162 }
163 } else if (type.width == 16) {
164 if (!type.sign) {
165 intrinsic = "llvm.ppc.altivec.vminuh";
166 } else {
167 intrinsic = "llvm.ppc.altivec.vminsh";
168 }
169 } else if (type.width == 32) {
170 if (!type.sign) {
171 intrinsic = "llvm.ppc.altivec.vminuw";
172 } else {
173 intrinsic = "llvm.ppc.altivec.vminsw";
174 }
175 }
176 }
177
178 if(intrinsic) {
179 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180 type,
181 intr_size, a, b);
182 }
183
184 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
185 return lp_build_select(bld, cond, a, b);
186 }
187
188
189 /**
190 * Generate max(a, b)
191 * No checks for the special-case values a or b = 1 or 0 are done.
192 */
193 static LLVMValueRef
194 lp_build_max_simple(struct lp_build_context *bld,
195 LLVMValueRef a,
196 LLVMValueRef b)
197 {
198 const struct lp_type type = bld->type;
199 const char *intrinsic = NULL;
200 unsigned intr_size = 0;
201 LLVMValueRef cond;
202
203 assert(lp_check_value(type, a));
204 assert(lp_check_value(type, b));
205
206 /* TODO: optimize the constant case */
207
208 if (type.floating && util_cpu_caps.has_sse) {
209 if (type.width == 32) {
210 if (type.length == 1) {
211 intrinsic = "llvm.x86.sse.max.ss";
212 intr_size = 128;
213 }
214 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
215 intrinsic = "llvm.x86.sse.max.ps";
216 intr_size = 128;
217 }
218 else {
219 intrinsic = "llvm.x86.avx.max.ps.256";
220 intr_size = 256;
221 }
222 }
223 if (type.width == 64 && util_cpu_caps.has_sse2) {
224 if (type.length == 1) {
225 intrinsic = "llvm.x86.sse2.max.sd";
226 intr_size = 128;
227 }
228 else if (type.length == 2 || !util_cpu_caps.has_avx) {
229 intrinsic = "llvm.x86.sse2.max.pd";
230 intr_size = 128;
231 }
232 else {
233 intrinsic = "llvm.x86.avx.max.pd.256";
234 intr_size = 256;
235 }
236 }
237 }
238 else if (type.floating && util_cpu_caps.has_altivec) {
239 if (type.width == 32 && type.length == 4) {
240 intrinsic = "llvm.ppc.altivec.vmaxfp";
241 intr_size = 128;
242 }
243 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
244 intr_size = 128;
245 if ((type.width == 8 || type.width == 16) &&
246 (type.width * type.length <= 64) &&
247 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
248 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
249 __FUNCTION__);
250 }
251 if (type.width == 8 && !type.sign) {
252 intrinsic = "llvm.x86.sse2.pmaxu.b";
253 intr_size = 128;
254 }
255 else if (type.width == 16 && type.sign) {
256 intrinsic = "llvm.x86.sse2.pmaxs.w";
257 }
258 if (util_cpu_caps.has_sse4_1) {
259 if (type.width == 8 && type.sign) {
260 intrinsic = "llvm.x86.sse41.pmaxsb";
261 }
262 if (type.width == 16 && !type.sign) {
263 intrinsic = "llvm.x86.sse41.pmaxuw";
264 }
265 if (type.width == 32 && !type.sign) {
266 intrinsic = "llvm.x86.sse41.pmaxud";
267 }
268 if (type.width == 32 && type.sign) {
269 intrinsic = "llvm.x86.sse41.pmaxsd";
270 }
271 }
272 } else if (util_cpu_caps.has_altivec) {
273 intr_size = 128;
274 if (type.width == 8) {
275 if (!type.sign) {
276 intrinsic = "llvm.ppc.altivec.vmaxub";
277 } else {
278 intrinsic = "llvm.ppc.altivec.vmaxsb";
279 }
280 } else if (type.width == 16) {
281 if (!type.sign) {
282 intrinsic = "llvm.ppc.altivec.vmaxuh";
283 } else {
284 intrinsic = "llvm.ppc.altivec.vmaxsh";
285 }
286 } else if (type.width == 32) {
287 if (!type.sign) {
288 intrinsic = "llvm.ppc.altivec.vmaxuw";
289 } else {
290 intrinsic = "llvm.ppc.altivec.vmaxsw";
291 }
292 }
293 }
294
295 if(intrinsic) {
296 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
297 type,
298 intr_size, a, b);
299 }
300
301 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
302 return lp_build_select(bld, cond, a, b);
303 }
304
305
306 /**
307 * Generate 1 - a, or ~a depending on bld->type.
308 */
309 LLVMValueRef
310 lp_build_comp(struct lp_build_context *bld,
311 LLVMValueRef a)
312 {
313 LLVMBuilderRef builder = bld->gallivm->builder;
314 const struct lp_type type = bld->type;
315
316 assert(lp_check_value(type, a));
317
318 if(a == bld->one)
319 return bld->zero;
320 if(a == bld->zero)
321 return bld->one;
322
323 if(type.norm && !type.floating && !type.fixed && !type.sign) {
324 if(LLVMIsConstant(a))
325 return LLVMConstNot(a);
326 else
327 return LLVMBuildNot(builder, a, "");
328 }
329
330 if(LLVMIsConstant(a))
331 if (type.floating)
332 return LLVMConstFSub(bld->one, a);
333 else
334 return LLVMConstSub(bld->one, a);
335 else
336 if (type.floating)
337 return LLVMBuildFSub(builder, bld->one, a, "");
338 else
339 return LLVMBuildSub(builder, bld->one, a, "");
340 }
341
342
343 /**
344 * Generate a + b
345 */
346 LLVMValueRef
347 lp_build_add(struct lp_build_context *bld,
348 LLVMValueRef a,
349 LLVMValueRef b)
350 {
351 LLVMBuilderRef builder = bld->gallivm->builder;
352 const struct lp_type type = bld->type;
353 LLVMValueRef res;
354
355 assert(lp_check_value(type, a));
356 assert(lp_check_value(type, b));
357
358 if(a == bld->zero)
359 return b;
360 if(b == bld->zero)
361 return a;
362 if(a == bld->undef || b == bld->undef)
363 return bld->undef;
364
365 if(bld->type.norm) {
366 const char *intrinsic = NULL;
367
368 if(a == bld->one || b == bld->one)
369 return bld->one;
370
371 if (type.width * type.length == 128 &&
372 !type.floating && !type.fixed) {
373 if(util_cpu_caps.has_sse2) {
374 if(type.width == 8)
375 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
376 if(type.width == 16)
377 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
378 } else if (util_cpu_caps.has_altivec) {
379 if(type.width == 8)
380 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
381 if(type.width == 16)
382 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
383 }
384 }
385
386 if(intrinsic)
387 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
388 }
389
390 /* TODO: handle signed case */
391 if(type.norm && !type.floating && !type.fixed && !type.sign)
392 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b));
393
394 if(LLVMIsConstant(a) && LLVMIsConstant(b))
395 if (type.floating)
396 res = LLVMConstFAdd(a, b);
397 else
398 res = LLVMConstAdd(a, b);
399 else
400 if (type.floating)
401 res = LLVMBuildFAdd(builder, a, b, "");
402 else
403 res = LLVMBuildAdd(builder, a, b, "");
404
405 /* clamp to ceiling of 1.0 */
406 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
407 res = lp_build_min_simple(bld, res, bld->one);
408
409 /* XXX clamp to floor of -1 or 0??? */
410
411 return res;
412 }
413
414
415 /** Return the scalar sum of the elements of a.
416 * Callers should avoid this operation whenever possible.
417 */
418 LLVMValueRef
419 lp_build_horizontal_add(struct lp_build_context *bld,
420 LLVMValueRef a)
421 {
422 LLVMBuilderRef builder = bld->gallivm->builder;
423 const struct lp_type type = bld->type;
424 LLVMValueRef index, res;
425 unsigned i, length;
426 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
427 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
428 LLVMValueRef vecres, elem2;
429
430 assert(lp_check_value(type, a));
431
432 if (type.length == 1) {
433 return a;
434 }
435
436 assert(!bld->type.norm);
437
438 /*
439 * For byte vectors we could do much better with psadbw.
440 * Using repeated shuffle/adds here; a scalar reference sketch follows this
441 * function. Note with multiple vectors this can be done more efficiently
442 * as outlined in the Intel optimization manual.
443 * Note: could cause data rearrangement if used with smaller element
444 * sizes.
445 */
446
447 vecres = a;
448 length = type.length / 2;
449 while (length > 1) {
450 LLVMValueRef vec1, vec2;
451 for (i = 0; i < length; i++) {
452 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
453 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
454 }
455 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
456 LLVMConstVector(shuffles1, length), "");
457 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
458 LLVMConstVector(shuffles2, length), "");
459 if (type.floating) {
460 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
461 }
462 else {
463 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
464 }
465 length = length >> 1;
466 }
467
468 /* always have vector of size 2 here */
469 assert(length == 1);
470
471 index = lp_build_const_int32(bld->gallivm, 0);
472 res = LLVMBuildExtractElement(builder, vecres, index, "");
473 index = lp_build_const_int32(bld->gallivm, 1);
474 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
475
476 if (type.floating)
477 res = LLVMBuildFAdd(builder, res, elem2, "");
478 else
479 res = LLVMBuildAdd(builder, res, elem2, "");
480
481 return res;
482 }
483
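/*
 * Scalar reference for the reduction above (illustrative sketch only, not part
 * of the build): the shuffle/add loop plus the final two-element add repeatedly
 * fold the upper half of the vector onto the lower half until one element is
 * left.  The array form below assumes a power-of-two length, as the vector
 * path effectively does.
 */
#if 0
static float
horizontal_add_reference(float v[], unsigned length)
{
   unsigned half, i;
   for (half = length / 2; half >= 1; half /= 2) {
      for (i = 0; i < half; i++) {
         v[i] = v[i] + v[i + half];   /* vec1[i] + vec2[i] in the IR above */
      }
   }
   return v[0];
}
#endif
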
484 /**
485 * Return the horizontal sums of 4 float vectors as a float4 vector.
486 * This uses the technique outlined in the Intel Optimization Manual.
487 */
488 static LLVMValueRef
489 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
490 LLVMValueRef src[4])
491 {
492 struct gallivm_state *gallivm = bld->gallivm;
493 LLVMBuilderRef builder = gallivm->builder;
494 LLVMValueRef shuffles[4];
495 LLVMValueRef tmp[4];
496 LLVMValueRef sumtmp[2], shuftmp[2];
497
498 /* lower half of regs */
499 shuffles[0] = lp_build_const_int32(gallivm, 0);
500 shuffles[1] = lp_build_const_int32(gallivm, 1);
501 shuffles[2] = lp_build_const_int32(gallivm, 4);
502 shuffles[3] = lp_build_const_int32(gallivm, 5);
503 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
504 LLVMConstVector(shuffles, 4), "");
505 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
506 LLVMConstVector(shuffles, 4), "");
507
508 /* upper half of regs */
509 shuffles[0] = lp_build_const_int32(gallivm, 2);
510 shuffles[1] = lp_build_const_int32(gallivm, 3);
511 shuffles[2] = lp_build_const_int32(gallivm, 6);
512 shuffles[3] = lp_build_const_int32(gallivm, 7);
513 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
514 LLVMConstVector(shuffles, 4), "");
515 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
516 LLVMConstVector(shuffles, 4), "");
517
518 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
519 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
520
521 shuffles[0] = lp_build_const_int32(gallivm, 0);
522 shuffles[1] = lp_build_const_int32(gallivm, 2);
523 shuffles[2] = lp_build_const_int32(gallivm, 4);
524 shuffles[3] = lp_build_const_int32(gallivm, 6);
525 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
526 LLVMConstVector(shuffles, 4), "");
527
528 shuffles[0] = lp_build_const_int32(gallivm, 1);
529 shuffles[1] = lp_build_const_int32(gallivm, 3);
530 shuffles[2] = lp_build_const_int32(gallivm, 5);
531 shuffles[3] = lp_build_const_int32(gallivm, 7);
532 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
533 LLVMConstVector(shuffles, 4), "");
534
535 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
536 }
537
538
539 /*
540 * Partially horizontally add 2-4 float vectors with length n*4:
541 * only groups of four adjacent values within each vector are summed,
542 * assuming the values really are grouped in fours, which also determines
543 * the output order.
544 *
545 * Return a vector of the same length as the initial vectors,
546 * with the excess elements (if any) being undefined.
547 * The element order is independent of number of input vectors.
548 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
549 * the output order thus will be
550 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
551 */
552 LLVMValueRef
553 lp_build_hadd_partial4(struct lp_build_context *bld,
554 LLVMValueRef vectors[],
555 unsigned num_vecs)
556 {
557 struct gallivm_state *gallivm = bld->gallivm;
558 LLVMBuilderRef builder = gallivm->builder;
559 LLVMValueRef ret_vec;
560 LLVMValueRef tmp[4];
561 const char *intrinsic = NULL;
562
563 assert(num_vecs >= 2 && num_vecs <= 4);
564 assert(bld->type.floating);
565
566 /* Only use this with at least 2 vectors, as it is sort of expensive
567 * (depending on cpu); for a single vector we would need two horizontal
568 * adds anyway, so a plain shuffle/add approach might be better there.
569 */
570
571 tmp[0] = vectors[0];
572 tmp[1] = vectors[1];
573
574 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
575 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
576
577 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
578 bld->type.length == 4) {
579 intrinsic = "llvm.x86.sse3.hadd.ps";
580 }
581 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
582 bld->type.length == 8) {
583 intrinsic = "llvm.x86.avx.hadd.ps.256";
584 }
585 if (intrinsic) {
586 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
587 lp_build_vec_type(gallivm, bld->type),
588 tmp[0], tmp[1]);
589 if (num_vecs > 2) {
590 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
591 lp_build_vec_type(gallivm, bld->type),
592 tmp[2], tmp[3]);
593 }
594 else {
595 tmp[1] = tmp[0];
596 }
597 return lp_build_intrinsic_binary(builder, intrinsic,
598 lp_build_vec_type(gallivm, bld->type),
599 tmp[0], tmp[1]);
600 }
601
602 if (bld->type.length == 4) {
603 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
604 }
605 else {
606 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
607 unsigned j;
608 unsigned num_iter = bld->type.length / 4;
609 struct lp_type parttype = bld->type;
610 parttype.length = 4;
611 for (j = 0; j < num_iter; j++) {
612 LLVMValueRef partsrc[4];
613 unsigned i;
614 for (i = 0; i < 4; i++) {
615 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
616 }
617 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
618 }
619 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
620 }
621 return ret_vec;
622 }
623
624 /**
625 * Generate a - b
626 */
627 LLVMValueRef
628 lp_build_sub(struct lp_build_context *bld,
629 LLVMValueRef a,
630 LLVMValueRef b)
631 {
632 LLVMBuilderRef builder = bld->gallivm->builder;
633 const struct lp_type type = bld->type;
634 LLVMValueRef res;
635
636 assert(lp_check_value(type, a));
637 assert(lp_check_value(type, b));
638
639 if(b == bld->zero)
640 return a;
641 if(a == bld->undef || b == bld->undef)
642 return bld->undef;
643 if(a == b)
644 return bld->zero;
645
646 if(bld->type.norm) {
647 const char *intrinsic = NULL;
648
649 if(b == bld->one)
650 return bld->zero;
651
652 if (type.width * type.length == 128 &&
653 !type.floating && !type.fixed) {
654 if (util_cpu_caps.has_sse2) {
655 if(type.width == 8)
656 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
657 if(type.width == 16)
658 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
659 } else if (util_cpu_caps.has_altivec) {
660 if(type.width == 8)
661 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
662 if(type.width == 16)
663 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
664 }
665 }
666
667 if(intrinsic)
668 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
669 }
670
671 /* TODO: handle signed case */
672 if(type.norm && !type.floating && !type.fixed && !type.sign)
673 a = lp_build_max_simple(bld, a, b);
674
675 if(LLVMIsConstant(a) && LLVMIsConstant(b))
676 if (type.floating)
677 res = LLVMConstFSub(a, b);
678 else
679 res = LLVMConstSub(a, b);
680 else
681 if (type.floating)
682 res = LLVMBuildFSub(builder, a, b, "");
683 else
684 res = LLVMBuildSub(builder, a, b, "");
685
686 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
687 res = lp_build_max_simple(bld, res, bld->zero);
688
689 return res;
690 }
691
692
693
694 /**
695 * Normalized multiplication.
696 *
697 * There are several approaches for (using 8-bit normalized multiplication as
698 * an example):
699 *
700 * - alpha plus one
701 *
702 * makes the following approximation to the division (Sree)
703 *
704 * a*b/255 ~= (a*(b + 1)) >> 8
705 *
706 * which is the fastest method that satisfies the OpenGL criteria of
707 *
708 * 0*0 = 0 and 255*255 = 255
709 *
710 * - geometric series
711 *
712 * takes the geometric series approximation to the division
713 *
714 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
715 *
716 * in this case just the first two terms to fit in 16bit arithmetic
717 *
718 * t/255 ~= (t + (t >> 8)) >> 8
719 *
720 * note that just by itself it doesn't satisfy the OpenGL criteria, as
721 * 255*255 = 254, so the special case b = 255 must be accounted for, or roundoff
722 * must be used.
723 *
724 * - geometric series plus rounding
725 *
726 * when using the geometric series division, instead of truncating the result,
727 * use roundoff in the approximation (Jim Blinn)
728 *
729 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
730 *
731 * achieving exact results (see the scalar sketch following lp_build_mul_norm).
732 *
733 *
734 *
735 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
736 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
737 * @sa Michael Herf, The "double blend trick", May 2000,
738 * http://www.stereopsis.com/doubleblend.html
739 */
740 static LLVMValueRef
741 lp_build_mul_norm(struct gallivm_state *gallivm,
742 struct lp_type wide_type,
743 LLVMValueRef a, LLVMValueRef b)
744 {
745 LLVMBuilderRef builder = gallivm->builder;
746 struct lp_build_context bld;
747 unsigned n;
748 LLVMValueRef half;
749 LLVMValueRef ab;
750
751 assert(!wide_type.floating);
752 assert(lp_check_value(wide_type, a));
753 assert(lp_check_value(wide_type, b));
754
755 lp_build_context_init(&bld, gallivm, wide_type);
756
757 n = wide_type.width / 2;
758 if (wide_type.sign) {
759 --n;
760 }
761
762 /*
763 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
764 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
765 */
766
767 /*
768 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
769 */
770
771 ab = LLVMBuildMul(builder, a, b, "");
772 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
773
774 /*
775 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
776 */
777
778 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
779 if (wide_type.sign) {
780 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
781 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
782 half = lp_build_select(&bld, sign, minus_half, half);
783 }
784 ab = LLVMBuildAdd(builder, ab, half, "");
785
786 /* Final division */
787 ab = lp_build_shr_imm(&bld, ab, n);
788
789 return ab;
790 }
791
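/*
 * Scalar reference for the unsigned n = 8 case above (illustrative sketch only,
 * not part of the build): multiply, add the first geometric-series correction
 * term and the rounding bias, then shift, exactly as lp_build_mul_norm does in
 * the vector domain.  It reproduces the 0*0 = 0 and 255*255 = 255 end points
 * required by OpenGL.
 */
#if 0
static unsigned
mul_norm_u8_reference(unsigned a, unsigned b)
{
   unsigned t = a * b;          /* at most 255 * 255 = 65025 */
   t = t + (t >> 8);            /* first two terms of the geometric series */
   t = t + 0x80;                /* rounding bias (half = 1 << (n - 1)) */
   return t >> 8;               /* final division by 2^n */
}
#endif
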
792 /**
793 * Generate a * b
794 */
795 LLVMValueRef
796 lp_build_mul(struct lp_build_context *bld,
797 LLVMValueRef a,
798 LLVMValueRef b)
799 {
800 LLVMBuilderRef builder = bld->gallivm->builder;
801 const struct lp_type type = bld->type;
802 LLVMValueRef shift;
803 LLVMValueRef res;
804
805 assert(lp_check_value(type, a));
806 assert(lp_check_value(type, b));
807
808 if(a == bld->zero)
809 return bld->zero;
810 if(a == bld->one)
811 return b;
812 if(b == bld->zero)
813 return bld->zero;
814 if(b == bld->one)
815 return a;
816 if(a == bld->undef || b == bld->undef)
817 return bld->undef;
818
819 if (!type.floating && !type.fixed && type.norm) {
820 struct lp_type wide_type = lp_wider_type(type);
821 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
822
823 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
824 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
825
826 /* PMULLW, PSRLW, PADDW */
827 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
828 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
829
830 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
831
832 return ab;
833 }
834
835 if(type.fixed)
836 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
837 else
838 shift = NULL;
839
840 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
841 if (type.floating)
842 res = LLVMConstFMul(a, b);
843 else
844 res = LLVMConstMul(a, b);
845 if(shift) {
846 if(type.sign)
847 res = LLVMConstAShr(res, shift);
848 else
849 res = LLVMConstLShr(res, shift);
850 }
851 }
852 else {
853 if (type.floating)
854 res = LLVMBuildFMul(builder, a, b, "");
855 else
856 res = LLVMBuildMul(builder, a, b, "");
857 if(shift) {
858 if(type.sign)
859 res = LLVMBuildAShr(builder, res, shift, "");
860 else
861 res = LLVMBuildLShr(builder, res, shift, "");
862 }
863 }
864
865 return res;
866 }
867
868
869 /**
870 * Small vector x scale multiplication optimization.
871 */
872 LLVMValueRef
873 lp_build_mul_imm(struct lp_build_context *bld,
874 LLVMValueRef a,
875 int b)
876 {
877 LLVMBuilderRef builder = bld->gallivm->builder;
878 LLVMValueRef factor;
879
880 assert(lp_check_value(bld->type, a));
881
882 if(b == 0)
883 return bld->zero;
884
885 if(b == 1)
886 return a;
887
888 if(b == -1)
889 return lp_build_negate(bld, a);
890
891 if(b == 2 && bld->type.floating)
892 return lp_build_add(bld, a, a);
893
894 if(util_is_power_of_two(b)) {
895 unsigned shift = ffs(b) - 1;
896
897 if(bld->type.floating) {
898 #if 0
899 /*
900 * Power of two multiplication by directly manipulating the exponent.
901 *
902 * XXX: This might not always be faster; it will introduce a small error
903 * for multiplication by zero, and it will produce wrong results
904 * for Inf and NaN.
905 */
906 unsigned mantissa = lp_mantissa(bld->type);
907 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
908 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
909 a = LLVMBuildAdd(builder, a, factor, "");
910 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
911 return a;
912 #endif
913 }
914 else {
915 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
916 return LLVMBuildShl(builder, a, factor, "");
917 }
918 }
919
920 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
921 return lp_build_mul(bld, a, factor);
922 }
923
924
925 /**
926 * Generate a / b
927 */
928 LLVMValueRef
929 lp_build_div(struct lp_build_context *bld,
930 LLVMValueRef a,
931 LLVMValueRef b)
932 {
933 LLVMBuilderRef builder = bld->gallivm->builder;
934 const struct lp_type type = bld->type;
935
936 assert(lp_check_value(type, a));
937 assert(lp_check_value(type, b));
938
939 if(a == bld->zero)
940 return bld->zero;
941 if(a == bld->one)
942 return lp_build_rcp(bld, b);
943 if(b == bld->zero)
944 return bld->undef;
945 if(b == bld->one)
946 return a;
947 if(a == bld->undef || b == bld->undef)
948 return bld->undef;
949
950 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
951 if (type.floating)
952 return LLVMConstFDiv(a, b);
953 else if (type.sign)
954 return LLVMConstSDiv(a, b);
955 else
956 return LLVMConstUDiv(a, b);
957 }
958
959 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
960 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
961 type.floating)
962 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
963
964 if (type.floating)
965 return LLVMBuildFDiv(builder, a, b, "");
966 else if (type.sign)
967 return LLVMBuildSDiv(builder, a, b, "");
968 else
969 return LLVMBuildUDiv(builder, a, b, "");
970 }
971
972
973 /**
974 * Linear interpolation helper.
975 *
976 * @param flags LP_BLD_LERP_xxx flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
977 * interpolating normalized values encoded in integers twice as wide.
978 *
979 * @sa http://www.stereopsis.com/doubleblend.html
980 */
981 static INLINE LLVMValueRef
982 lp_build_lerp_simple(struct lp_build_context *bld,
983 LLVMValueRef x,
984 LLVMValueRef v0,
985 LLVMValueRef v1,
986 unsigned flags)
987 {
988 unsigned half_width = bld->type.width/2;
989 LLVMBuilderRef builder = bld->gallivm->builder;
990 LLVMValueRef delta;
991 LLVMValueRef res;
992
993 assert(lp_check_value(bld->type, x));
994 assert(lp_check_value(bld->type, v0));
995 assert(lp_check_value(bld->type, v1));
996
997 delta = lp_build_sub(bld, v1, v0);
998
999 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1000 if (!bld->type.sign) {
1001 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1002 /*
1003 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1004 * most significant bit to the least significant bit, so that
1005 * later we can just divide by 2**n instead of 2**n - 1.
1006 */
1007
1008 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1009 }
1010
1011 /* (x * delta) >> n */
1012 res = lp_build_mul(bld, x, delta);
1013 res = lp_build_shr_imm(bld, res, half_width);
1014 } else {
1015 /*
1016 * The rescaling trick above doesn't work for signed numbers, so
1017 * use the 2**n - 1 division approximation in lp_build_mul_norm
1018 * instead.
1019 */
1020 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1021 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1022 }
1023 } else {
1024 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1025 res = lp_build_mul(bld, x, delta);
1026 }
1027
1028 res = lp_build_add(bld, v0, res);
1029
1030 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1031 bld->type.fixed) {
1032 /* We need to mask out the high order bits when lerping 8bit normalized colors stored in 16 bits */
1033 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1034 * but it will be wrong for true fixed point use cases. Basically we need
1035 * a more powerful lp_type, capable of further distinguishing the values
1036 * interpretation from the value storage. */
1037 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1038 }
1039
1040 return res;
1041 }
1042
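/*
 * Scalar reference for the LP_BLD_LERP_WIDE_NORMALIZED path above, for unsigned
 * 8-bit values held in 16-bit lanes (illustrative sketch only, not part of the
 * build).  The weight is rescaled from [0, 255] to [0, 256] by folding its MSB
 * back into the LSB, so the final division can be a plain shift, and the
 * closing mask recovers the 8-bit result even when v1 - v0 wrapped around.
 */
#if 0
static unsigned
lerp_u8_reference(unsigned x, unsigned v0, unsigned v1)
{
   unsigned delta = (v1 - v0) & 0xffff;       /* 16-bit lane, may wrap */
   x = x + (x >> 7);                          /* [0, 255] -> [0, 256] */
   return (v0 + (((x * delta) & 0xffff) >> 8)) & 0xff;
}
#endif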
1043
1044 /**
1045 * Linear interpolation.
1046 */
1047 LLVMValueRef
1048 lp_build_lerp(struct lp_build_context *bld,
1049 LLVMValueRef x,
1050 LLVMValueRef v0,
1051 LLVMValueRef v1,
1052 unsigned flags)
1053 {
1054 const struct lp_type type = bld->type;
1055 LLVMValueRef res;
1056
1057 assert(lp_check_value(type, x));
1058 assert(lp_check_value(type, v0));
1059 assert(lp_check_value(type, v1));
1060
1061 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1062
1063 if (type.norm) {
1064 struct lp_type wide_type;
1065 struct lp_build_context wide_bld;
1066 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1067
1068 assert(type.length >= 2);
1069
1070 /*
1071 * Create a wider integer type, enough to hold the
1072 * intermediate result of the multiplication.
1073 */
1074 memset(&wide_type, 0, sizeof wide_type);
1075 wide_type.sign = type.sign;
1076 wide_type.width = type.width*2;
1077 wide_type.length = type.length/2;
1078
1079 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1080
1081 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1082 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1083 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1084
1085 /*
1086 * Lerp both halves.
1087 */
1088
1089 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1090
1091 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1092 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1093
1094 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1095 } else {
1096 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1097 }
1098
1099 return res;
1100 }
1101
1102
1103 /**
1104 * Bilinear interpolation.
1105 *
1106 * Value indices are in v_{yx}.
1107 */
1108 LLVMValueRef
1109 lp_build_lerp_2d(struct lp_build_context *bld,
1110 LLVMValueRef x,
1111 LLVMValueRef y,
1112 LLVMValueRef v00,
1113 LLVMValueRef v01,
1114 LLVMValueRef v10,
1115 LLVMValueRef v11,
1116 unsigned flags)
1117 {
1118 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1119 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1120 return lp_build_lerp(bld, y, v0, v1, flags);
1121 }
1122
1123
1124 LLVMValueRef
1125 lp_build_lerp_3d(struct lp_build_context *bld,
1126 LLVMValueRef x,
1127 LLVMValueRef y,
1128 LLVMValueRef z,
1129 LLVMValueRef v000,
1130 LLVMValueRef v001,
1131 LLVMValueRef v010,
1132 LLVMValueRef v011,
1133 LLVMValueRef v100,
1134 LLVMValueRef v101,
1135 LLVMValueRef v110,
1136 LLVMValueRef v111,
1137 unsigned flags)
1138 {
1139 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1140 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1141 return lp_build_lerp(bld, z, v0, v1, flags);
1142 }
1143
1144
1145 /**
1146 * Generate min(a, b)
1147 * Do checks for special cases.
1148 */
1149 LLVMValueRef
1150 lp_build_min(struct lp_build_context *bld,
1151 LLVMValueRef a,
1152 LLVMValueRef b)
1153 {
1154 assert(lp_check_value(bld->type, a));
1155 assert(lp_check_value(bld->type, b));
1156
1157 if(a == bld->undef || b == bld->undef)
1158 return bld->undef;
1159
1160 if(a == b)
1161 return a;
1162
1163 if (bld->type.norm) {
1164 if (!bld->type.sign) {
1165 if (a == bld->zero || b == bld->zero) {
1166 return bld->zero;
1167 }
1168 }
1169 if(a == bld->one)
1170 return b;
1171 if(b == bld->one)
1172 return a;
1173 }
1174
1175 return lp_build_min_simple(bld, a, b);
1176 }
1177
1178
1179 /**
1180 * Generate max(a, b)
1181 * Do checks for special cases.
1182 */
1183 LLVMValueRef
1184 lp_build_max(struct lp_build_context *bld,
1185 LLVMValueRef a,
1186 LLVMValueRef b)
1187 {
1188 assert(lp_check_value(bld->type, a));
1189 assert(lp_check_value(bld->type, b));
1190
1191 if(a == bld->undef || b == bld->undef)
1192 return bld->undef;
1193
1194 if(a == b)
1195 return a;
1196
1197 if(bld->type.norm) {
1198 if(a == bld->one || b == bld->one)
1199 return bld->one;
1200 if (!bld->type.sign) {
1201 if (a == bld->zero) {
1202 return b;
1203 }
1204 if (b == bld->zero) {
1205 return a;
1206 }
1207 }
1208 }
1209
1210 return lp_build_max_simple(bld, a, b);
1211 }
1212
1213
1214 /**
1215 * Generate clamp(a, min, max)
1216 * Do checks for special cases.
1217 */
1218 LLVMValueRef
1219 lp_build_clamp(struct lp_build_context *bld,
1220 LLVMValueRef a,
1221 LLVMValueRef min,
1222 LLVMValueRef max)
1223 {
1224 assert(lp_check_value(bld->type, a));
1225 assert(lp_check_value(bld->type, min));
1226 assert(lp_check_value(bld->type, max));
1227
1228 a = lp_build_min(bld, a, max);
1229 a = lp_build_max(bld, a, min);
1230 return a;
1231 }
1232
1233
1234 /**
1235 * Generate abs(a)
1236 */
1237 LLVMValueRef
1238 lp_build_abs(struct lp_build_context *bld,
1239 LLVMValueRef a)
1240 {
1241 LLVMBuilderRef builder = bld->gallivm->builder;
1242 const struct lp_type type = bld->type;
1243 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1244
1245 assert(lp_check_value(type, a));
1246
1247 if(!type.sign)
1248 return a;
1249
1250 if(type.floating) {
1251 /* Mask out the sign bit */
1252 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1253 unsigned long long absMask = ~(1ULL << (type.width - 1));
1254 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1255 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1256 a = LLVMBuildAnd(builder, a, mask, "");
1257 a = LLVMBuildBitCast(builder, a, vec_type, "");
1258 return a;
1259 }
1260
1261 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1262 switch(type.width) {
1263 case 8:
1264 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1265 case 16:
1266 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1267 case 32:
1268 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1269 }
1270 }
1271 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1272 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1273 (type.width == 8 || type.width == 16 || type.width == 32)) {
1274 debug_printf("%s: inefficient code, should split vectors manually\n",
1275 __FUNCTION__);
1276 }
1277
1278 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1279 }
1280
1281
1282 LLVMValueRef
1283 lp_build_negate(struct lp_build_context *bld,
1284 LLVMValueRef a)
1285 {
1286 LLVMBuilderRef builder = bld->gallivm->builder;
1287
1288 assert(lp_check_value(bld->type, a));
1289
1290 #if HAVE_LLVM >= 0x0207
1291 if (bld->type.floating)
1292 a = LLVMBuildFNeg(builder, a, "");
1293 else
1294 #endif
1295 a = LLVMBuildNeg(builder, a, "");
1296
1297 return a;
1298 }
1299
1300
1301 /** Return -1, 0 or +1 depending on the sign of a */
1302 LLVMValueRef
1303 lp_build_sgn(struct lp_build_context *bld,
1304 LLVMValueRef a)
1305 {
1306 LLVMBuilderRef builder = bld->gallivm->builder;
1307 const struct lp_type type = bld->type;
1308 LLVMValueRef cond;
1309 LLVMValueRef res;
1310
1311 assert(lp_check_value(type, a));
1312
1313 /* Handle non-zero case */
1314 if(!type.sign) {
1315 /* if not zero then sign must be positive */
1316 res = bld->one;
1317 }
1318 else if(type.floating) {
1319 LLVMTypeRef vec_type;
1320 LLVMTypeRef int_type;
1321 LLVMValueRef mask;
1322 LLVMValueRef sign;
1323 LLVMValueRef one;
1324 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1325
1326 int_type = lp_build_int_vec_type(bld->gallivm, type);
1327 vec_type = lp_build_vec_type(bld->gallivm, type);
1328 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1329
1330 /* Take the sign bit and add it to 1 constant */
1331 sign = LLVMBuildBitCast(builder, a, int_type, "");
1332 sign = LLVMBuildAnd(builder, sign, mask, "");
1333 one = LLVMConstBitCast(bld->one, int_type);
1334 res = LLVMBuildOr(builder, sign, one, "");
1335 res = LLVMBuildBitCast(builder, res, vec_type, "");
1336 }
1337 else
1338 {
1339 /* signed int/norm/fixed point */
1340 /* could use psign with sse3 and appropriate vectors here */
1341 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1342 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1343 res = lp_build_select(bld, cond, bld->one, minus_one);
1344 }
1345
1346 /* Handle zero */
1347 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1348 res = lp_build_select(bld, cond, bld->zero, res);
1349
1350 return res;
1351 }
1352
1353
1354 /**
1355 * Set the sign of float vector 'a' according to 'sign'.
1356 * If sign==0, return abs(a).
1357 * If sign==1, return -abs(a).
1358 * Other values for sign produce undefined results.
1359 */
1360 LLVMValueRef
1361 lp_build_set_sign(struct lp_build_context *bld,
1362 LLVMValueRef a, LLVMValueRef sign)
1363 {
1364 LLVMBuilderRef builder = bld->gallivm->builder;
1365 const struct lp_type type = bld->type;
1366 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1367 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1368 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1369 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1370 ~((unsigned long long) 1 << (type.width - 1)));
1371 LLVMValueRef val, res;
1372
1373 assert(type.floating);
1374 assert(lp_check_value(type, a));
1375
1376 /* val = reinterpret_cast<int>(a) */
1377 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1378 /* val = val & mask */
1379 val = LLVMBuildAnd(builder, val, mask, "");
1380 /* sign = sign << shift */
1381 sign = LLVMBuildShl(builder, sign, shift, "");
1382 /* res = val | sign */
1383 res = LLVMBuildOr(builder, val, sign, "");
1384 /* res = reinterpret_cast<float>(res) */
1385 res = LLVMBuildBitCast(builder, res, vec_type, "");
1386
1387 return res;
1388 }
1389
1390
1391 /**
1392 * Convert vector of (or scalar) int to vector of (or scalar) float.
1393 */
1394 LLVMValueRef
1395 lp_build_int_to_float(struct lp_build_context *bld,
1396 LLVMValueRef a)
1397 {
1398 LLVMBuilderRef builder = bld->gallivm->builder;
1399 const struct lp_type type = bld->type;
1400 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1401
1402 assert(type.floating);
1403
1404 return LLVMBuildSIToFP(builder, a, vec_type, "");
1405 }
1406
1407 static boolean
1408 arch_rounding_available(const struct lp_type type)
1409 {
1410 if ((util_cpu_caps.has_sse4_1 &&
1411 (type.length == 1 || type.width*type.length == 128)) ||
1412 (util_cpu_caps.has_avx && type.width*type.length == 256))
1413 return TRUE;
1414 else if ((util_cpu_caps.has_altivec &&
1415 (type.width == 32 && type.length == 4)))
1416 return TRUE;
1417
1418 return FALSE;
1419 }
1420
1421 enum lp_build_round_mode
1422 {
1423 LP_BUILD_ROUND_NEAREST = 0,
1424 LP_BUILD_ROUND_FLOOR = 1,
1425 LP_BUILD_ROUND_CEIL = 2,
1426 LP_BUILD_ROUND_TRUNCATE = 3
1427 };
1428
1429 /**
1430 * Helper for SSE4.1's ROUNDxx instructions.
1431 *
1432 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1433 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0
 * (see the round-to-even examples after this function).
1434 */
1435 static INLINE LLVMValueRef
1436 lp_build_round_sse41(struct lp_build_context *bld,
1437 LLVMValueRef a,
1438 enum lp_build_round_mode mode)
1439 {
1440 LLVMBuilderRef builder = bld->gallivm->builder;
1441 const struct lp_type type = bld->type;
1442 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1443 const char *intrinsic;
1444 LLVMValueRef res;
1445
1446 assert(type.floating);
1447
1448 assert(lp_check_value(type, a));
1449 assert(util_cpu_caps.has_sse4_1);
1450
1451 if (type.length == 1) {
1452 LLVMTypeRef vec_type;
1453 LLVMValueRef undef;
1454 LLVMValueRef args[3];
1455 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1456
1457 switch(type.width) {
1458 case 32:
1459 intrinsic = "llvm.x86.sse41.round.ss";
1460 break;
1461 case 64:
1462 intrinsic = "llvm.x86.sse41.round.sd";
1463 break;
1464 default:
1465 assert(0);
1466 return bld->undef;
1467 }
1468
1469 vec_type = LLVMVectorType(bld->elem_type, 4);
1470
1471 undef = LLVMGetUndef(vec_type);
1472
1473 args[0] = undef;
1474 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1475 args[2] = LLVMConstInt(i32t, mode, 0);
1476
1477 res = lp_build_intrinsic(builder, intrinsic,
1478 vec_type, args, Elements(args));
1479
1480 res = LLVMBuildExtractElement(builder, res, index0, "");
1481 }
1482 else {
1483 if (type.width * type.length == 128) {
1484 switch(type.width) {
1485 case 32:
1486 intrinsic = "llvm.x86.sse41.round.ps";
1487 break;
1488 case 64:
1489 intrinsic = "llvm.x86.sse41.round.pd";
1490 break;
1491 default:
1492 assert(0);
1493 return bld->undef;
1494 }
1495 }
1496 else {
1497 assert(type.width * type.length == 256);
1498 assert(util_cpu_caps.has_avx);
1499
1500 switch(type.width) {
1501 case 32:
1502 intrinsic = "llvm.x86.avx.round.ps.256";
1503 break;
1504 case 64:
1505 intrinsic = "llvm.x86.avx.round.pd.256";
1506 break;
1507 default:
1508 assert(0);
1509 return bld->undef;
1510 }
1511 }
1512
1513 res = lp_build_intrinsic_binary(builder, intrinsic,
1514 bld->vec_type, a,
1515 LLVMConstInt(i32t, mode, 0));
1516 }
1517
1518 return res;
1519 }
1520
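/*
 * Illustration of the round-to-even note above (sketch only, not part of the
 * build): mode LP_BUILD_ROUND_NEAREST behaves like C99 rint()/nearbyint()
 * under the default rounding mode, not like round(), which breaks ties away
 * from zero.
 */
#if 0
#include <math.h>
#include <assert.h>
static void
round_nearest_even_examples(void)
{
   assert(rint(2.5) == 2.0);    /* tie resolved toward the even value */
   assert(rint(3.5) == 4.0);    /* the even neighbour may be the larger one */
   assert(round(2.5) == 3.0);   /* round() breaks ties away from zero instead */
}
#endif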
1521
1522 static INLINE LLVMValueRef
1523 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1524 LLVMValueRef a)
1525 {
1526 LLVMBuilderRef builder = bld->gallivm->builder;
1527 const struct lp_type type = bld->type;
1528 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1529 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1530 const char *intrinsic;
1531 LLVMValueRef res;
1532
1533 assert(type.floating);
1534 /* using the double precision conversions is a bit more complicated */
1535 assert(type.width == 32);
1536
1537 assert(lp_check_value(type, a));
1538 assert(util_cpu_caps.has_sse2);
1539
1540 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1541 if (type.length == 1) {
1542 LLVMTypeRef vec_type;
1543 LLVMValueRef undef;
1544 LLVMValueRef arg;
1545 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1546
1547 vec_type = LLVMVectorType(bld->elem_type, 4);
1548
1549 intrinsic = "llvm.x86.sse.cvtss2si";
1550
1551 undef = LLVMGetUndef(vec_type);
1552
1553 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1554
1555 res = lp_build_intrinsic_unary(builder, intrinsic,
1556 ret_type, arg);
1557 }
1558 else {
1559 if (type.width* type.length == 128) {
1560 intrinsic = "llvm.x86.sse2.cvtps2dq";
1561 }
1562 else {
1563 assert(type.width*type.length == 256);
1564 assert(util_cpu_caps.has_avx);
1565
1566 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1567 }
1568 res = lp_build_intrinsic_unary(builder, intrinsic,
1569 ret_type, a);
1570 }
1571
1572 return res;
1573 }
1574
1575
1576 /* Helper for AltiVec's vrfi* rounding instructions. */
1578 static INLINE LLVMValueRef
1579 lp_build_round_altivec(struct lp_build_context *bld,
1580 LLVMValueRef a,
1581 enum lp_build_round_mode mode)
1582 {
1583 LLVMBuilderRef builder = bld->gallivm->builder;
1584 const struct lp_type type = bld->type;
1585 const char *intrinsic = NULL;
1586
1587 assert(type.floating);
1588
1589 assert(lp_check_value(type, a));
1590 assert(util_cpu_caps.has_altivec);
1591
1592 switch (mode) {
1593 case LP_BUILD_ROUND_NEAREST:
1594 intrinsic = "llvm.ppc.altivec.vrfin";
1595 break;
1596 case LP_BUILD_ROUND_FLOOR:
1597 intrinsic = "llvm.ppc.altivec.vrfim";
1598 break;
1599 case LP_BUILD_ROUND_CEIL:
1600 intrinsic = "llvm.ppc.altivec.vrfip";
1601 break;
1602 case LP_BUILD_ROUND_TRUNCATE:
1603 intrinsic = "llvm.ppc.altivec.vrfiz";
1604 break;
1605 }
1606
1607 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1608 }
1609
1610 static INLINE LLVMValueRef
1611 lp_build_round_arch(struct lp_build_context *bld,
1612 LLVMValueRef a,
1613 enum lp_build_round_mode mode)
1614 {
1615 if (util_cpu_caps.has_sse4_1)
1616 return lp_build_round_sse41(bld, a, mode);
1617 else /* (util_cpu_caps.has_altivec) */
1618 return lp_build_round_altivec(bld, a, mode);
1619 }
1620
1621 /**
1622 * Return the integer part of a float (vector) value (== round toward zero).
1623 * The returned value is a float (vector).
1624 * Ex: trunc(-1.5) = -1.0
1625 */
1626 LLVMValueRef
1627 lp_build_trunc(struct lp_build_context *bld,
1628 LLVMValueRef a)
1629 {
1630 LLVMBuilderRef builder = bld->gallivm->builder;
1631 const struct lp_type type = bld->type;
1632
1633 assert(type.floating);
1634 assert(lp_check_value(type, a));
1635
1636 if (arch_rounding_available(type)) {
1637 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1638 }
1639 else {
1640 const struct lp_type type = bld->type;
1641 struct lp_type inttype;
1642 struct lp_build_context intbld;
1643 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1644 LLVMValueRef trunc, res, anosign, mask;
1645 LLVMTypeRef int_vec_type = bld->int_vec_type;
1646 LLVMTypeRef vec_type = bld->vec_type;
1647
1648 assert(type.width == 32); /* might want to handle doubles at some point */
1649
1650 inttype = type;
1651 inttype.floating = 0;
1652 lp_build_context_init(&intbld, bld->gallivm, inttype);
1653
1654 /* round by truncation */
1655 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1656 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1657
1658 /* mask out sign bit */
1659 anosign = lp_build_abs(bld, a);
1660 /*
1661 * mask out all values if anosign > 2^24
1662 * This should work both for large ints (all rounding is no-op for them
1663 * because such floats are always exact) as well as special cases like
1664 * NaNs, Infs (taking advantage of the fact they use max exponent).
1665 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1666 */
1667 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1668 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1669 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1670 return lp_build_select(bld, mask, a, res);
1671 }
1672 }
1673
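/*
 * Scalar reference for the fallback path above (illustrative sketch only, not
 * part of the build): truncate through the integer domain, but keep the
 * original value whenever |a| > 2^24.  Such floats are already integral, and
 * because the compare is done on the (sign-masked) float bits as integers,
 * NaN and Inf also compare greater and are passed through unchanged.
 */
#if 0
static float
trunc_reference(float a)
{
   union { float f; int i; } v, lim;  /* assumes 32-bit int */
   v.f = a;
   v.i &= 0x7fffffff;                 /* |a| as raw bits, sign masked off */
   lim.f = 16777216.0f;               /* 2^24 */
   if (v.i > lim.i)
      return a;                       /* already integral, or NaN/Inf */
   return (float)(int)a;              /* round toward zero */
}
#endif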
1674
1675 /**
1676 * Return float (vector) rounded to nearest integer (vector). The returned
1677 * value is a float (vector).
1678 * Ex: round(0.9) = 1.0
1679 * Ex: round(-1.5) = -2.0
1680 */
1681 LLVMValueRef
1682 lp_build_round(struct lp_build_context *bld,
1683 LLVMValueRef a)
1684 {
1685 LLVMBuilderRef builder = bld->gallivm->builder;
1686 const struct lp_type type = bld->type;
1687
1688 assert(type.floating);
1689 assert(lp_check_value(type, a));
1690
1691 if (arch_rounding_available(type)) {
1692 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1693 }
1694 else {
1695 const struct lp_type type = bld->type;
1696 struct lp_type inttype;
1697 struct lp_build_context intbld;
1698 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1699 LLVMValueRef res, anosign, mask;
1700 LLVMTypeRef int_vec_type = bld->int_vec_type;
1701 LLVMTypeRef vec_type = bld->vec_type;
1702
1703 assert(type.width == 32); /* might want to handle doubles at some point */
1704
1705 inttype = type;
1706 inttype.floating = 0;
1707 lp_build_context_init(&intbld, bld->gallivm, inttype);
1708
1709 res = lp_build_iround(bld, a);
1710 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1711
1712 /* mask out sign bit */
1713 anosign = lp_build_abs(bld, a);
1714 /*
1715 * mask out all values if anosign > 2^24
1716 * This should work both for large ints (all rounding is no-op for them
1717 * because such floats are always exact) as well as special cases like
1718 * NaNs, Infs (taking advantage of the fact they use max exponent).
1719 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1720 */
1721 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1722 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1723 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1724 return lp_build_select(bld, mask, a, res);
1725 }
1726 }
1727
1728
1729 /**
1730 * Return floor of float (vector), result is a float (vector)
1731 * Ex: floor(1.1) = 1.0
1732 * Ex: floor(-1.1) = -2.0
1733 */
1734 LLVMValueRef
1735 lp_build_floor(struct lp_build_context *bld,
1736 LLVMValueRef a)
1737 {
1738 LLVMBuilderRef builder = bld->gallivm->builder;
1739 const struct lp_type type = bld->type;
1740
1741 assert(type.floating);
1742 assert(lp_check_value(type, a));
1743
1744 if (arch_rounding_available(type)) {
1745 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1746 }
1747 else {
1748 const struct lp_type type = bld->type;
1749 struct lp_type inttype;
1750 struct lp_build_context intbld;
1751 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1752 LLVMValueRef trunc, res, anosign, mask;
1753 LLVMTypeRef int_vec_type = bld->int_vec_type;
1754 LLVMTypeRef vec_type = bld->vec_type;
1755
1756 assert(type.width == 32); /* might want to handle doubles at some point */
1757
1758 inttype = type;
1759 inttype.floating = 0;
1760 lp_build_context_init(&intbld, bld->gallivm, inttype);
1761
1762 /* round by truncation */
1763 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1764 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1765
1766 if (type.sign) {
1767 LLVMValueRef tmp;
1768
1769 /*
1770 * fix values if rounding is wrong (for non-special cases)
1771 * - this is the case if trunc > a
1772 */
1773 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1774 /* tmp = trunc > a ? 1.0 : 0.0 */
1775 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1776 tmp = lp_build_and(&intbld, mask, tmp);
1777 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1778 res = lp_build_sub(bld, res, tmp);
1779 }
1780
1781 /* mask out sign bit */
1782 anosign = lp_build_abs(bld, a);
1783 /*
1784 * mask out all values if anosign > 2^24
1785 * This should work both for large ints (all rounding is no-op for them
1786 * because such floats are always exact) as well as special cases like
1787 * NaNs, Infs (taking advantage of the fact they use max exponent).
1788 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1789 */
1790 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1791 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1792 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1793 return lp_build_select(bld, mask, a, res);
1794 }
1795 }
1796
1797
1798 /**
1799 * Return ceiling of float (vector), returning float (vector).
1800 * Ex: ceil( 1.1) = 2.0
1801 * Ex: ceil(-1.1) = -1.0
1802 */
1803 LLVMValueRef
1804 lp_build_ceil(struct lp_build_context *bld,
1805 LLVMValueRef a)
1806 {
1807 LLVMBuilderRef builder = bld->gallivm->builder;
1808 const struct lp_type type = bld->type;
1809
1810 assert(type.floating);
1811 assert(lp_check_value(type, a));
1812
1813 if (arch_rounding_available(type)) {
1814 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1815 }
1816 else {
1817 const struct lp_type type = bld->type;
1818 struct lp_type inttype;
1819 struct lp_build_context intbld;
1820 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1821 LLVMValueRef trunc, res, anosign, mask, tmp;
1822 LLVMTypeRef int_vec_type = bld->int_vec_type;
1823 LLVMTypeRef vec_type = bld->vec_type;
1824
1825 assert(type.width == 32); /* might want to handle doubles at some point */
1826
1827 inttype = type;
1828 inttype.floating = 0;
1829 lp_build_context_init(&intbld, bld->gallivm, inttype);
1830
1831 /* round by truncation */
1832 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1833 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
1834
1835 /*
1836 * fix values if rounding is wrong (for non-special cases)
1837 * - this is the case if trunc < a
1838 */
1839 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
1840 /* tmp = trunc < a ? 1.0 : 0.0 */
1841 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1842 tmp = lp_build_and(&intbld, mask, tmp);
1843 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1844 res = lp_build_add(bld, trunc, tmp);
1845
1846 /* mask out sign bit */
1847 anosign = lp_build_abs(bld, a);
1848 /*
1849 * mask out all values if anosign > 2^24
1850 * This should work both for large ints (all rounding is no-op for them
1851 * because such floats are always exact) as well as special cases like
1852 * NaNs, Infs (taking advantage of the fact they use max exponent).
1853 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1854 */
1855 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1856 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1857 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1858 return lp_build_select(bld, mask, a, res);
1859 }
1860 }
1861
1862
1863 /**
1864 * Return fractional part of 'a' computed as a - floor(a)
1865 * Typically used in texture coord arithmetic.
1866 */
1867 LLVMValueRef
1868 lp_build_fract(struct lp_build_context *bld,
1869 LLVMValueRef a)
1870 {
1871 assert(bld->type.floating);
1872 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1873 }
1874
1875
1876 /**
1877 * Prevent returning a fractional part of 1.0 for very small negative values of
1878 * 'a' by clamping against 0.99999(9).
1879 */
1880 static inline LLVMValueRef
1881 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
1882 {
1883 LLVMValueRef max;
1884
1885 /* this is the largest number smaller than 1.0 representable as float */
1886 max = lp_build_const_vec(bld->gallivm, bld->type,
1887 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
1888 return lp_build_min(bld, fract, max);
1889 }
1890
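/*
 * The clamp constant above, spelled out for 32-bit floats (illustrative sketch
 * only, not part of the build): with a 23-bit mantissa the largest
 * representable value below 1.0 is 1.0 - 2^-24, i.e. 0x3f7fffff as raw bits.
 */
#if 0
static float
largest_float_below_one(void)
{
   return 1.0f - 1.0f / (float)(1 << 24);   /* == nextafterf(1.0f, 0.0f) */
}
#endif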
1891
1892 /**
1893 * Same as lp_build_fract, but guarantees that the result is always smaller
1894 * than one.
1895 */
1896 LLVMValueRef
1897 lp_build_fract_safe(struct lp_build_context *bld,
1898 LLVMValueRef a)
1899 {
1900 return clamp_fract(bld, lp_build_fract(bld, a));
1901 }
1902
1903
1904 /**
1905 * Return the integer part of a float (vector) value (== round toward zero).
1906 * The returned value is an integer (vector).
1907 * Ex: itrunc(-1.5) = -1
1908 */
1909 LLVMValueRef
1910 lp_build_itrunc(struct lp_build_context *bld,
1911 LLVMValueRef a)
1912 {
1913 LLVMBuilderRef builder = bld->gallivm->builder;
1914 const struct lp_type type = bld->type;
1915 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1916
1917 assert(type.floating);
1918 assert(lp_check_value(type, a));
1919
1920 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1921 }
1922
1923
1924 /**
1925 * Return float (vector) rounded to nearest integer (vector). The returned
1926 * value is an integer (vector).
1927 * Ex: iround(0.9) = 1
1928 * Ex: iround(-1.5) = -2
1929 */
1930 LLVMValueRef
1931 lp_build_iround(struct lp_build_context *bld,
1932 LLVMValueRef a)
1933 {
1934 LLVMBuilderRef builder = bld->gallivm->builder;
1935 const struct lp_type type = bld->type;
1936 LLVMTypeRef int_vec_type = bld->int_vec_type;
1937 LLVMValueRef res;
1938
1939 assert(type.floating);
1940
1941 assert(lp_check_value(type, a));
1942
1943 if ((util_cpu_caps.has_sse2 &&
1944 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
1945 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1946 return lp_build_iround_nearest_sse2(bld, a);
1947 }
1948 if (arch_rounding_available(type)) {
1949 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1950 }
1951 else {
1952 LLVMValueRef half;
1953
1954 half = lp_build_const_vec(bld->gallivm, type, 0.5);
1955
1956 if (type.sign) {
1957 LLVMTypeRef vec_type = bld->vec_type;
1958 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1959 (unsigned long long)1 << (type.width - 1));
1960 LLVMValueRef sign;
1961
1962 /* get sign bit */
1963 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1964 sign = LLVMBuildAnd(builder, sign, mask, "");
1965
1966 /* sign * 0.5 */
1967 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1968 half = LLVMBuildOr(builder, sign, half, "");
1969 half = LLVMBuildBitCast(builder, half, vec_type, "");
1970 }
1971
1972 res = LLVMBuildFAdd(builder, a, half, "");
1973 }
1974
1975 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1976
1977 return res;
1978 }
1979
1980
1981 /**
1982 * Return floor of float (vector), result is an int (vector)
1983  * Ex: ifloor(1.1) = 1
1984  * Ex: ifloor(-1.1) = -2
1985 */
1986 LLVMValueRef
1987 lp_build_ifloor(struct lp_build_context *bld,
1988 LLVMValueRef a)
1989 {
1990 LLVMBuilderRef builder = bld->gallivm->builder;
1991 const struct lp_type type = bld->type;
1992 LLVMTypeRef int_vec_type = bld->int_vec_type;
1993 LLVMValueRef res;
1994
1995 assert(type.floating);
1996 assert(lp_check_value(type, a));
1997
1998 res = a;
1999 if (type.sign) {
2000 if (arch_rounding_available(type)) {
2001 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2002 }
2003 else {
2004 struct lp_type inttype;
2005 struct lp_build_context intbld;
2006 LLVMValueRef trunc, itrunc, mask;
2007
2008 assert(type.floating);
2009 assert(lp_check_value(type, a));
2010
2011 inttype = type;
2012 inttype.floating = 0;
2013 lp_build_context_init(&intbld, bld->gallivm, inttype);
2014
2015 /* round by truncation */
2016 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2017 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2018
2019 /*
2020 * fix values if rounding is wrong (for non-special cases)
2021 * - this is the case if trunc > a
2022 * The results of doing this with NaNs, very large values etc.
2023 * are undefined but this seems to be the case anyway.
2024 */
2025 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2026 /* cheapie minus one with mask since the mask is minus one / zero */
2027 return lp_build_add(&intbld, itrunc, mask);
2028 }
2029 }
2030
2031    /* convert to int; truncation towards zero is correct here */
2032 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2033
2034 return res;
2035 }
2036
2037
2038 /**
2039  * Return ceiling of float (vector), result is an int (vector).
2040 * Ex: iceil( 1.1) = 2
2041 * Ex: iceil(-1.1) = -1
2042 */
2043 LLVMValueRef
2044 lp_build_iceil(struct lp_build_context *bld,
2045 LLVMValueRef a)
2046 {
2047 LLVMBuilderRef builder = bld->gallivm->builder;
2048 const struct lp_type type = bld->type;
2049 LLVMTypeRef int_vec_type = bld->int_vec_type;
2050 LLVMValueRef res;
2051
2052 assert(type.floating);
2053 assert(lp_check_value(type, a));
2054
2055 if (arch_rounding_available(type)) {
2056 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2057 }
2058 else {
2059 struct lp_type inttype;
2060 struct lp_build_context intbld;
2061 LLVMValueRef trunc, itrunc, mask;
2062
2063 assert(type.floating);
2064 assert(lp_check_value(type, a));
2065
2066 inttype = type;
2067 inttype.floating = 0;
2068 lp_build_context_init(&intbld, bld->gallivm, inttype);
2069
2070 /* round by truncation */
2071 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2072 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2073
2074 /*
2075 * fix values if rounding is wrong (for non-special cases)
2076 * - this is the case if trunc < a
2077 * The results of doing this with NaNs, very large values etc.
2078 * are undefined but this seems to be the case anyway.
2079 */
2080 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2081 /* cheapie plus one with mask since the mask is minus one / zero */
2082 return lp_build_sub(&intbld, itrunc, mask);
2083 }
2084
2085    /* convert to int; res is already integral so truncation is exact */
2086 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2087
2088 return res;
2089 }
2090
2091
2092 /**
2093 * Combined ifloor() & fract().
2094 *
2095 * Preferred to calling the functions separately, as it will ensure that the
2096 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2097 */
2098 void
2099 lp_build_ifloor_fract(struct lp_build_context *bld,
2100 LLVMValueRef a,
2101 LLVMValueRef *out_ipart,
2102 LLVMValueRef *out_fpart)
2103 {
2104 LLVMBuilderRef builder = bld->gallivm->builder;
2105 const struct lp_type type = bld->type;
2106 LLVMValueRef ipart;
2107
2108 assert(type.floating);
2109 assert(lp_check_value(type, a));
2110
2111 if (arch_rounding_available(type)) {
2112 /*
2113 * floor() is easier.
2114 */
2115
2116 ipart = lp_build_floor(bld, a);
2117 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2118 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2119 }
2120 else {
2121 /*
2122 * ifloor() is easier.
2123 */
2124
2125 *out_ipart = lp_build_ifloor(bld, a);
2126 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2127 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2128 }
2129 }
2130
2131
2132 /**
2133 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2134 * always smaller than one.
2135 */
2136 void
2137 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2138 LLVMValueRef a,
2139 LLVMValueRef *out_ipart,
2140 LLVMValueRef *out_fpart)
2141 {
2142 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2143 *out_fpart = clamp_fract(bld, *out_fpart);
2144 }
2145
2146
2147 LLVMValueRef
2148 lp_build_sqrt(struct lp_build_context *bld,
2149 LLVMValueRef a)
2150 {
2151 LLVMBuilderRef builder = bld->gallivm->builder;
2152 const struct lp_type type = bld->type;
2153 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2154 char intrinsic[32];
2155
2156 assert(lp_check_value(type, a));
2157
2158 /* TODO: optimize the constant case */
2159
2160 assert(type.floating);
2161 if (type.length == 1) {
2162 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2163 }
2164 else {
2165 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2166 }
2167
2168 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2169 }
2170
2171
2172 /**
2173  * Do one Newton-Raphson step to improve reciprocal precision:
2174 *
2175 * x_{i+1} = x_i * (2 - a * x_i)
2176 *
2177 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2178 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2179  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2180 * halo. It would be necessary to clamp the argument to prevent this.
2181 *
2182 * See also:
2183 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2184 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2185 */
2186 static INLINE LLVMValueRef
2187 lp_build_rcp_refine(struct lp_build_context *bld,
2188 LLVMValueRef a,
2189 LLVMValueRef rcp_a)
2190 {
2191 LLVMBuilderRef builder = bld->gallivm->builder;
2192 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2193 LLVMValueRef res;
2194
2195 res = LLVMBuildFMul(builder, a, rcp_a, "");
2196 res = LLVMBuildFSub(builder, two, res, "");
2197 res = LLVMBuildFMul(builder, rcp_a, res, "");
2198
2199 return res;
2200 }
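
/*
 * Rough numeric illustration of one refinement step: with a = 3.0 and an
 * initial estimate rcp_a = 0.33 (about what RCPPS would give), one step
 * yields 0.33 * (2 - 3.0 * 0.33) = 0.33 * 1.01 = 0.3333; a second step
 * gives 0.33333..., i.e. the error roughly squares with each iteration.
 * For a = 0 (rcp_a = +Inf) the product a * rcp_a is NaN, which is the
 * non-conformant behaviour described above.
 */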
2201
2202
2203 LLVMValueRef
2204 lp_build_rcp(struct lp_build_context *bld,
2205 LLVMValueRef a)
2206 {
2207 LLVMBuilderRef builder = bld->gallivm->builder;
2208 const struct lp_type type = bld->type;
2209
2210 assert(lp_check_value(type, a));
2211
2212 if(a == bld->zero)
2213 return bld->undef;
2214 if(a == bld->one)
2215 return bld->one;
2216 if(a == bld->undef)
2217 return bld->undef;
2218
2219 assert(type.floating);
2220
2221 if(LLVMIsConstant(a))
2222 return LLVMConstFDiv(bld->one, a);
2223
2224 /*
2225 * We don't use RCPPS because:
2226     * - it only has 10 bits of precision
2227     * - it doesn't even get the reciprocal of 1.0 exactly
2228     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2229     * - for recent processors the benefit over DIVPS is marginal, and case
2230     *   dependent
2231     *
2232     * We could still use it on certain processors if benchmarks show that the
2233     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2234     * particular uses that require fewer workarounds.
2235 */
2236
2237 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2238 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2239 const unsigned num_iterations = 0;
2240 LLVMValueRef res;
2241 unsigned i;
2242 const char *intrinsic = NULL;
2243
2244 if (type.length == 4) {
2245 intrinsic = "llvm.x86.sse.rcp.ps";
2246 }
2247 else {
2248 intrinsic = "llvm.x86.avx.rcp.ps.256";
2249 }
2250
2251 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2252
2253 for (i = 0; i < num_iterations; ++i) {
2254 res = lp_build_rcp_refine(bld, a, res);
2255 }
2256
2257 return res;
2258 }
2259
2260 return LLVMBuildFDiv(builder, bld->one, a, "");
2261 }
2262
2263
2264 /**
2265 * Do one Newton-Raphson step to improve rsqrt precision:
2266 *
2267 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2268 *
2269 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2270 */
2271 static INLINE LLVMValueRef
2272 lp_build_rsqrt_refine(struct lp_build_context *bld,
2273 LLVMValueRef a,
2274 LLVMValueRef rsqrt_a)
2275 {
2276 LLVMBuilderRef builder = bld->gallivm->builder;
2277 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2278 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2279 LLVMValueRef res;
2280
2281 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2282 res = LLVMBuildFMul(builder, a, res, "");
2283 res = LLVMBuildFSub(builder, three, res, "");
2284 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2285 res = LLVMBuildFMul(builder, half, res, "");
2286
2287 return res;
2288 }
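
/*
 * Rough numeric illustration: with a = 4.0 and an initial estimate
 * rsqrt_a = 0.51, one step gives 0.5 * 0.51 * (3 - 4.0 * 0.51 * 0.51)
 * = 0.5 * 0.51 * 1.9596 = 0.4997, already close to the exact 0.5.
 */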
2289
2290
2291 /**
2292 * Generate 1/sqrt(a).
2293 * Result is undefined for values < 0, infinity for +0.
2294 */
2295 LLVMValueRef
2296 lp_build_rsqrt(struct lp_build_context *bld,
2297 LLVMValueRef a)
2298 {
2299 LLVMBuilderRef builder = bld->gallivm->builder;
2300 const struct lp_type type = bld->type;
2301
2302 assert(lp_check_value(type, a));
2303
2304 assert(type.floating);
2305
2306 /*
2307 * This should be faster but all denormals will end up as infinity.
2308 */
2309 if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2310 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
2311 const unsigned num_iterations = 1;
2312 LLVMValueRef res;
2313 unsigned i;
2314 const char *intrinsic = NULL;
2315
2316 if (type.length == 4) {
2317 intrinsic = "llvm.x86.sse.rsqrt.ps";
2318 }
2319 else {
2320 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2321 }
2322 if (num_iterations) {
2323 /*
2324 * Newton-Raphson will result in NaN instead of infinity for zero,
2325 * and NaN instead of zero for infinity.
2326 * Also, need to ensure rsqrt(1.0) == 1.0.
2327 * All numbers smaller than FLT_MIN will result in +infinity
2328 * (rsqrtps treats all denormals as zero).
2329 */
2330 /*
2331          * Certain non-c99 compilers don't know INFINITY and might not support
2332          * hacks to evaluate it at compile time either.
2333 */
2334 const unsigned posinf_int = 0x7F800000;
2335 LLVMValueRef cmp;
2336 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2337 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2338
2339 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2340
2341 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2342
2343 for (i = 0; i < num_iterations; ++i) {
2344 res = lp_build_rsqrt_refine(bld, a, res);
2345 }
2346 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2347 res = lp_build_select(bld, cmp, inf, res);
2348 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2349 res = lp_build_select(bld, cmp, bld->zero, res);
2350 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2351 res = lp_build_select(bld, cmp, bld->one, res);
2352 }
2353 else {
2354 /* rsqrt(1.0) != 1.0 here */
2355 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2356
2357 }
2358
2359 return res;
2360 }
2361
2362 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2363 }
2364
2365
2366 /**
2367 * Generate sin(a) using SSE2
2368 */
2369 LLVMValueRef
2370 lp_build_sin(struct lp_build_context *bld,
2371 LLVMValueRef a)
2372 {
2373 struct gallivm_state *gallivm = bld->gallivm;
2374 LLVMBuilderRef builder = gallivm->builder;
2375 struct lp_type int_type = lp_int_type(bld->type);
2376 LLVMBuilderRef b = builder;
2377
2378 /*
2379 * take the absolute value,
2380 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2381 */
2382
2383 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2384 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2385
2386 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2387 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2388
2389 /*
2390 * extract the sign bit (upper one)
2391 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2392 */
2393 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2394 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2395
2396 /*
2397 * scale by 4/Pi
2398 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2399 */
2400
2401 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2402 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2403
2404 /*
2405 * store the integer part of y in mm0
2406 * emm2 = _mm_cvttps_epi32(y);
2407 */
2408
2409 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2410
2411 /*
2412 * j=(j+1) & (~1) (see the cephes sources)
2413 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2414 */
2415
2416 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2417 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2418 /*
2419 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2420 */
2421 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2422 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2423
2424 /*
2425 * y = _mm_cvtepi32_ps(emm2);
2426 */
2427 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2428
2429 /* get the swap sign flag
2430 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2431 */
2432 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2433 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2434
2435 /*
2436 * emm2 = _mm_slli_epi32(emm0, 29);
2437 */
2438 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2439 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2440
2441 /*
2442     * get the polynomial selection mask
2443     * there is one polynomial for 0 <= x <= Pi/4
2444     * and another one for Pi/4 < x <= Pi/2
2445 * Both branches will be computed.
2446 *
2447 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2448 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2449 */
2450
2451 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2452 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2453 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2454 int_type, PIPE_FUNC_EQUAL,
2455 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2456 /*
2457 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2458 */
2459 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2460
2461 /*
2462 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2463 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2464 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2465 */
2466 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2467 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2468 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2469
2470 /*
2471 * The magic pass: "Extended precision modular arithmetic"
2472 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2473 * xmm1 = _mm_mul_ps(y, xmm1);
2474 * xmm2 = _mm_mul_ps(y, xmm2);
2475 * xmm3 = _mm_mul_ps(y, xmm3);
2476 */
2477 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2478 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2479 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2480
2481 /*
2482 * x = _mm_add_ps(x, xmm1);
2483 * x = _mm_add_ps(x, xmm2);
2484 * x = _mm_add_ps(x, xmm3);
2485 */
2486
2487 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2488 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2489 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2490
2491 /*
2492     * Evaluate the first polynomial (0 <= x <= Pi/4)
2493 *
2494 * z = _mm_mul_ps(x,x);
2495 */
2496 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2497
2498 /*
2499 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2500 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2501 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2502 */
2503 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2504 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2505 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2506
2507 /*
2508 * y = *(v4sf*)_ps_coscof_p0;
2509 * y = _mm_mul_ps(y, z);
2510 */
2511 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2512 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2513 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2514 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2515 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2516 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2517
2518
2519 /*
2520 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2521 * y = _mm_sub_ps(y, tmp);
2522 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2523 */
2524 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2525 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2526 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2527 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2528 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2529
2530 /*
2531 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2532 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2533 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2534 */
2535 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2536 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2537 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2538
2539 /*
2540     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2541 *
2542 * y2 = *(v4sf*)_ps_sincof_p0;
2543 * y2 = _mm_mul_ps(y2, z);
2544 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2545 * y2 = _mm_mul_ps(y2, z);
2546 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2547 * y2 = _mm_mul_ps(y2, z);
2548 * y2 = _mm_mul_ps(y2, x);
2549 * y2 = _mm_add_ps(y2, x);
2550 */
2551
2552 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2553 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2554 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2555 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2556 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2557 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2558 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2559
2560 /*
2561     * select the correct result from the two polynomials
2562 * xmm3 = poly_mask;
2563 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2564 * y = _mm_andnot_ps(xmm3, y);
2565 * y = _mm_add_ps(y,y2);
2566 */
2567 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2568 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2569 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2570 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2571 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2572 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2573 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2574
2575 /*
2576 * update the sign
2577 * y = _mm_xor_ps(y, sign_bit);
2578 */
2579 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2580 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2581 return y_result;
2582 }
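
/*
 * Range-reduction walk-through for a = 1.0 (illustrative): y = 1.0 * 4/Pi
 * = 1.273, so j = (1 + 1) & ~1 = 2. Since j & 4 = 0 the sign is unchanged,
 * and j & 2 != 0 selects the cosine polynomial. The reduced argument is
 * x = 1.0 - 2 * Pi/4 = -0.5708, and cos(-0.5708) = 0.8415 = sin(1.0).
 */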
2583
2584
2585 /**
2586 * Generate cos(a) using SSE2
2587 */
2588 LLVMValueRef
2589 lp_build_cos(struct lp_build_context *bld,
2590 LLVMValueRef a)
2591 {
2592 struct gallivm_state *gallivm = bld->gallivm;
2593 LLVMBuilderRef builder = gallivm->builder;
2594 struct lp_type int_type = lp_int_type(bld->type);
2595 LLVMBuilderRef b = builder;
2596
2597 /*
2598 * take the absolute value,
2599 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2600 */
2601
2602 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2603 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2604
2605 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2606 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2607
2608 /*
2609 * scale by 4/Pi
2610 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2611 */
2612
2613 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2614 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2615
2616 /*
2617 * store the integer part of y in mm0
2618 * emm2 = _mm_cvttps_epi32(y);
2619 */
2620
2621 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2622
2623 /*
2624 * j=(j+1) & (~1) (see the cephes sources)
2625 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2626 */
2627
2628 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2629 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2630 /*
2631 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2632 */
2633 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2634 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2635
2636 /*
2637 * y = _mm_cvtepi32_ps(emm2);
2638 */
2639 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2640
2641
2642 /*
2643 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2644 */
2645 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2646 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2647
2648
2649 /* get the swap sign flag
2650 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2651 */
2652 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2653 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2654 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2655 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2656
2657 /*
2658 * emm2 = _mm_slli_epi32(emm0, 29);
2659 */
2660 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2661 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2662
2663 /*
2664     * get the polynomial selection mask
2665     * there is one polynomial for 0 <= x <= Pi/4
2666     * and another one for Pi/4 < x <= Pi/2
2667 * Both branches will be computed.
2668 *
2669 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2670 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2671 */
2672
2673 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2674 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2675 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2676 int_type, PIPE_FUNC_EQUAL,
2677 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2678
2679 /*
2680 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2681 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2682 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2683 */
2684 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2685 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2686 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2687
2688 /*
2689 * The magic pass: "Extended precision modular arithmetic"
2690 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2691 * xmm1 = _mm_mul_ps(y, xmm1);
2692 * xmm2 = _mm_mul_ps(y, xmm2);
2693 * xmm3 = _mm_mul_ps(y, xmm3);
2694 */
2695 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2696 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2697 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2698
2699 /*
2700 * x = _mm_add_ps(x, xmm1);
2701 * x = _mm_add_ps(x, xmm2);
2702 * x = _mm_add_ps(x, xmm3);
2703 */
2704
2705 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2706 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2707 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2708
2709 /*
2710     * Evaluate the first polynomial (0 <= x <= Pi/4)
2711 *
2712 * z = _mm_mul_ps(x,x);
2713 */
2714 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2715
2716 /*
2717 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2718 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2719 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2720 */
2721 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2722 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2723 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2724
2725 /*
2726 * y = *(v4sf*)_ps_coscof_p0;
2727 * y = _mm_mul_ps(y, z);
2728 */
2729 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2730 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2731 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2732 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2733 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2734 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2735
2736
2737 /*
2738 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2739 * y = _mm_sub_ps(y, tmp);
2740 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2741 */
2742 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2743 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2744 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2745 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2746 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2747
2748 /*
2749 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2750 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2751 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2752 */
2753 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2754 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2755 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2756
2757 /*
2758     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2759 *
2760 * y2 = *(v4sf*)_ps_sincof_p0;
2761 * y2 = _mm_mul_ps(y2, z);
2762 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2763 * y2 = _mm_mul_ps(y2, z);
2764 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2765 * y2 = _mm_mul_ps(y2, z);
2766 * y2 = _mm_mul_ps(y2, x);
2767 * y2 = _mm_add_ps(y2, x);
2768 */
2769
2770 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2771 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2772 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2773 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2774 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2775 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2776 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2777
2778 /*
2779     * select the correct result from the two polynomials
2780 * xmm3 = poly_mask;
2781 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2782 * y = _mm_andnot_ps(xmm3, y);
2783 * y = _mm_add_ps(y,y2);
2784 */
2785 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2786 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2787 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2788 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2789 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2790 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2791
2792 /*
2793 * update the sign
2794 * y = _mm_xor_ps(y, sign_bit);
2795 */
2796 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2797 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2798 return y_result;
2799 }
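
/*
 * Same walk-through for cos(1.0) (illustrative): j = 2 as for sin, then
 * j - 2 = 0, so ~(j - 2) & 4 = 4 sets the sign-flip bit and (j - 2) & 2 = 0
 * selects the sine polynomial. The reduced argument is again -0.5708,
 * sin(-0.5708) = -0.5403, and the sign flip gives cos(1.0) = 0.5403.
 */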
2800
2801
2802 /**
2803 * Generate pow(x, y)
2804 */
2805 LLVMValueRef
2806 lp_build_pow(struct lp_build_context *bld,
2807 LLVMValueRef x,
2808 LLVMValueRef y)
2809 {
2810 /* TODO: optimize the constant case */
2811 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2812 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2813 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2814 __FUNCTION__);
2815 }
2816
2817 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2818 }
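
/*
 * For example, pow(2.0, 10.0) = exp2(log2(2.0) * 10.0) = exp2(10.0) = 1024.
 * Since the identity goes through log2, the result is not meaningful for
 * x <= 0 (negative bases and zero are not handled).
 */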
2819
2820
2821 /**
2822 * Generate exp(x)
2823 */
2824 LLVMValueRef
2825 lp_build_exp(struct lp_build_context *bld,
2826 LLVMValueRef x)
2827 {
2828 /* log2(e) = 1/log(2) */
2829 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2830 1.4426950408889634);
2831
2832 assert(lp_check_value(bld->type, x));
2833
2834 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2835 }
2836
2837
2838 /**
2839 * Generate log(x)
2840 */
2841 LLVMValueRef
2842 lp_build_log(struct lp_build_context *bld,
2843 LLVMValueRef x)
2844 {
2845 /* log(2) */
2846 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2847 0.69314718055994529);
2848
2849 assert(lp_check_value(bld->type, x));
2850
2851 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2852 }
2853
2854
2855 /**
2856 * Generate polynomial.
2857 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2858 */
2859 static LLVMValueRef
2860 lp_build_polynomial(struct lp_build_context *bld,
2861 LLVMValueRef x,
2862 const double *coeffs,
2863 unsigned num_coeffs)
2864 {
2865 const struct lp_type type = bld->type;
2866 LLVMValueRef even = NULL, odd = NULL;
2867 LLVMValueRef x2;
2868 unsigned i;
2869
2870 assert(lp_check_value(bld->type, x));
2871
2872 /* TODO: optimize the constant case */
2873 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2874 LLVMIsConstant(x)) {
2875 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2876 __FUNCTION__);
2877 }
2878
2879 /*
2880     * Calculate odd and even terms separately to decrease data dependency
2881 * Ex:
2882 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2883 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2884 */
2885 x2 = lp_build_mul(bld, x, x);
2886
2887 for (i = num_coeffs; i--; ) {
2888 LLVMValueRef coeff;
2889
2890 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2891
2892 if (i % 2 == 0) {
2893 if (even)
2894 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2895 else
2896 even = coeff;
2897 } else {
2898 if (odd)
2899 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2900 else
2901 odd = coeff;
2902 }
2903 }
2904
2905 if (odd)
2906 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2907 else if (even)
2908 return even;
2909 else
2910 return bld->undef;
2911 }
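
/*
 * A scalar reference of the even/odd split above (illustrative sketch,
 * not part of the gallivm API): for num_coeffs == 4 it computes
 * (c[0] + c[2]*x^2) + x*(c[1] + c[3]*x^2).
 *
 *    double x2 = x * x, even = 0.0, odd = 0.0;
 *    for (int i = num_coeffs - 1; i >= 0; i--) {
 *       if (i % 2 == 0)
 *          even = coeffs[i] + x2 * even;
 *       else
 *          odd = coeffs[i] + x2 * odd;
 *    }
 *    return odd * x + even;
 */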
2912
2913
2914 /**
2915 * Minimax polynomial fit of 2**x, in range [0, 1[
2916 */
2917 const double lp_build_exp2_polynomial[] = {
2918 #if EXP_POLY_DEGREE == 5
2919 0.999999925063526176901,
2920 0.693153073200168932794,
2921 0.240153617044375388211,
2922 0.0558263180532956664775,
2923 0.00898934009049466391101,
2924 0.00187757667519147912699
2925 #elif EXP_POLY_DEGREE == 4
2926 1.00000259337069434683,
2927 0.693003834469974940458,
2928 0.24144275689150793076,
2929 0.0520114606103070150235,
2930 0.0135341679161270268764
2931 #elif EXP_POLY_DEGREE == 3
2932 0.999925218562710312959,
2933 0.695833540494823811697,
2934 0.226067155427249155588,
2935 0.0780245226406372992967
2936 #elif EXP_POLY_DEGREE == 2
2937 1.00172476321474503578,
2938 0.657636275736077639316,
2939 0.33718943461968720704
2940 #else
2941 #error
2942 #endif
2943 };
2944
2945
2946 void
2947 lp_build_exp2_approx(struct lp_build_context *bld,
2948 LLVMValueRef x,
2949 LLVMValueRef *p_exp2_int_part,
2950 LLVMValueRef *p_frac_part,
2951 LLVMValueRef *p_exp2)
2952 {
2953 LLVMBuilderRef builder = bld->gallivm->builder;
2954 const struct lp_type type = bld->type;
2955 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2956 LLVMValueRef ipart = NULL;
2957 LLVMValueRef fpart = NULL;
2958 LLVMValueRef expipart = NULL;
2959 LLVMValueRef expfpart = NULL;
2960 LLVMValueRef res = NULL;
2961
2962 assert(lp_check_value(bld->type, x));
2963
2964 if(p_exp2_int_part || p_frac_part || p_exp2) {
2965 /* TODO: optimize the constant case */
2966 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2967 LLVMIsConstant(x)) {
2968 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2969 __FUNCTION__);
2970 }
2971
2972 assert(type.floating && type.width == 32);
2973
2974 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
2975 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2976
2977 /* ipart = floor(x) */
2978 /* fpart = x - ipart */
2979 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2980 }
2981
2982 if(p_exp2_int_part || p_exp2) {
2983 /* expipart = (float) (1 << ipart) */
2984 expipart = LLVMBuildAdd(builder, ipart,
2985 lp_build_const_int_vec(bld->gallivm, type, 127), "");
2986 expipart = LLVMBuildShl(builder, expipart,
2987 lp_build_const_int_vec(bld->gallivm, type, 23), "");
2988 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2989 }
2990
2991 if(p_exp2) {
2992 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2993 Elements(lp_build_exp2_polynomial));
2994
2995 res = LLVMBuildFMul(builder, expipart, expfpart, "");
2996 }
2997
2998 if(p_exp2_int_part)
2999 *p_exp2_int_part = expipart;
3000
3001 if(p_frac_part)
3002 *p_frac_part = fpart;
3003
3004 if(p_exp2)
3005 *p_exp2 = res;
3006 }
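
/*
 * For example (illustrative): x = 2.5 splits into ipart = 2, fpart = 0.5.
 * The integer part becomes (2 + 127) << 23, which reinterpreted as a float
 * is 4.0, and the polynomial gives expfpart = 2^0.5 = 1.4142, so the
 * product is 5.657 = 2^2.5. The clamps above keep the biased exponent
 * within the representable range.
 */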
3007
3008
3009 LLVMValueRef
3010 lp_build_exp2(struct lp_build_context *bld,
3011 LLVMValueRef x)
3012 {
3013 LLVMValueRef res;
3014 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3015 return res;
3016 }
3017
3018
3019 /**
3020 * Extract the exponent of a IEEE-754 floating point value.
3021 *
3022 * Optionally apply an integer bias.
3023 *
3024 * Result is an integer value with
3025 *
3026 * ifloor(log2(x)) + bias
3027 */
3028 LLVMValueRef
3029 lp_build_extract_exponent(struct lp_build_context *bld,
3030 LLVMValueRef x,
3031 int bias)
3032 {
3033 LLVMBuilderRef builder = bld->gallivm->builder;
3034 const struct lp_type type = bld->type;
3035 unsigned mantissa = lp_mantissa(type);
3036 LLVMValueRef res;
3037
3038 assert(type.floating);
3039
3040 assert(lp_check_value(bld->type, x));
3041
3042 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3043
3044 res = LLVMBuildLShr(builder, x,
3045 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3046 res = LLVMBuildAnd(builder, res,
3047 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3048 res = LLVMBuildSub(builder, res,
3049 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3050
3051 return res;
3052 }
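
/*
 * For example: x = 8.0 has biased exponent bits 130, so the result is
 * 130 - 127 + bias = 3 + bias = ifloor(log2(8.0)) + bias.
 */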
3053
3054
3055 /**
3056  * Extract the mantissa of a floating point value.
3057  *
3058  * Result is a floating point value with
3059  *
3060  *   x / 2**floor(log2(x))
3061 */
3062 LLVMValueRef
3063 lp_build_extract_mantissa(struct lp_build_context *bld,
3064 LLVMValueRef x)
3065 {
3066 LLVMBuilderRef builder = bld->gallivm->builder;
3067 const struct lp_type type = bld->type;
3068 unsigned mantissa = lp_mantissa(type);
3069 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3070 (1ULL << mantissa) - 1);
3071 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3072 LLVMValueRef res;
3073
3074 assert(lp_check_value(bld->type, x));
3075
3076 assert(type.floating);
3077
3078 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3079
3080 /* res = x / 2**ipart */
3081 res = LLVMBuildAnd(builder, x, mantmask, "");
3082 res = LLVMBuildOr(builder, res, one, "");
3083 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3084
3085 return res;
3086 }
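
/*
 * For example: x = 12.0 = 1.5 * 2^3. Its mantissa bits OR'ed with the bit
 * pattern of 1.0 give 1.5, i.e. 12.0 / 2^ifloor(log2(12.0)).
 */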
3087
3088
3089
3090 /**
3091  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3092  * These coefficients can be generated with
3093 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3094 */
3095 const double lp_build_log2_polynomial[] = {
3096 #if LOG_POLY_DEGREE == 5
3097 2.88539008148777786488L,
3098 0.961796878841293367824L,
3099 0.577058946784739859012L,
3100 0.412914355135828735411L,
3101 0.308591899232910175289L,
3102 0.352376952300281371868L,
3103 #elif LOG_POLY_DEGREE == 4
3104 2.88539009343309178325L,
3105 0.961791550404184197881L,
3106 0.577440339438736392009L,
3107 0.403343858251329912514L,
3108 0.406718052498846252698L,
3109 #elif LOG_POLY_DEGREE == 3
3110 2.88538959748872753838L,
3111 0.961932915889597772928L,
3112 0.571118517972136195241L,
3113 0.493997535084709500285L,
3114 #else
3115 #error
3116 #endif
3117 };
3118
3119 /**
3120 * See http://www.devmaster.net/forums/showthread.php?p=43580
3121 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3122 * http://www.nezumi.demon.co.uk/consult/logx.htm
3123 */
3124 void
3125 lp_build_log2_approx(struct lp_build_context *bld,
3126 LLVMValueRef x,
3127 LLVMValueRef *p_exp,
3128 LLVMValueRef *p_floor_log2,
3129 LLVMValueRef *p_log2)
3130 {
3131 LLVMBuilderRef builder = bld->gallivm->builder;
3132 const struct lp_type type = bld->type;
3133 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3134 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3135
3136 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3137 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3138 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3139
3140 LLVMValueRef i = NULL;
3141 LLVMValueRef y = NULL;
3142 LLVMValueRef z = NULL;
3143 LLVMValueRef exp = NULL;
3144 LLVMValueRef mant = NULL;
3145 LLVMValueRef logexp = NULL;
3146 LLVMValueRef logmant = NULL;
3147 LLVMValueRef res = NULL;
3148
3149 assert(lp_check_value(bld->type, x));
3150
3151 if(p_exp || p_floor_log2 || p_log2) {
3152 /* TODO: optimize the constant case */
3153 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3154 LLVMIsConstant(x)) {
3155 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3156 __FUNCTION__);
3157 }
3158
3159 assert(type.floating && type.width == 32);
3160
3161 /*
3162 * We don't explicitly handle denormalized numbers. They will yield a
3163        * result in the neighbourhood of -127, which appears to be
3164        * adequate.
3165 */
3166
3167 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3168
3169 /* exp = (float) exponent(x) */
3170 exp = LLVMBuildAnd(builder, i, expmask, "");
3171 }
3172
3173 if(p_floor_log2 || p_log2) {
3174 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3175 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3176 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3177 }
3178
3179 if(p_log2) {
3180 /* mant = 1 + (float) mantissa(x) */
3181 mant = LLVMBuildAnd(builder, i, mantmask, "");
3182 mant = LLVMBuildOr(builder, mant, one, "");
3183 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3184
3185 /* y = (mant - 1) / (mant + 1) */
3186 y = lp_build_div(bld,
3187 lp_build_sub(bld, mant, bld->one),
3188 lp_build_add(bld, mant, bld->one)
3189 );
3190
3191 /* z = y^2 */
3192 z = lp_build_mul(bld, y, y);
3193
3194 /* compute P(z) */
3195 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3196 Elements(lp_build_log2_polynomial));
3197
3198 /* logmant = y * P(z) */
3199 logmant = lp_build_mul(bld, y, logmant);
3200
3201 res = lp_build_add(bld, logmant, logexp);
3202 }
3203
3204 if(p_exp) {
3205 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3206 *p_exp = exp;
3207 }
3208
3209 if(p_floor_log2)
3210 *p_floor_log2 = logexp;
3211
3212 if(p_log2)
3213 *p_log2 = res;
3214 }
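
/*
 * Worked example (illustrative): x = 3.0 = 1.5 * 2^1, so logexp = 1 and
 * mant = 1.5. Then y = (1.5 - 1) / (1.5 + 1) = 0.2, z = 0.04, and
 * y * P(z) = 0.585, giving log2(3.0) = 1 + 0.585 = 1.585.
 */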
3215
3216
3217 LLVMValueRef
3218 lp_build_log2(struct lp_build_context *bld,
3219 LLVMValueRef x)
3220 {
3221 LLVMValueRef res;
3222 lp_build_log2_approx(bld, x, NULL, NULL, &res);
3223 return res;
3224 }
3225
3226
3227 /**
3228 * Faster (and less accurate) log2.
3229 *
3230 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3231 *
3232 * Piece-wise linear approximation, with exact results when x is a
3233 * power of two.
3234 *
3235 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3236 */
3237 LLVMValueRef
3238 lp_build_fast_log2(struct lp_build_context *bld,
3239 LLVMValueRef x)
3240 {
3241 LLVMBuilderRef builder = bld->gallivm->builder;
3242 LLVMValueRef ipart;
3243 LLVMValueRef fpart;
3244
3245 assert(lp_check_value(bld->type, x));
3246
3247 assert(bld->type.floating);
3248
3249 /* ipart = floor(log2(x)) - 1 */
3250 ipart = lp_build_extract_exponent(bld, x, -1);
3251 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3252
3253 /* fpart = x / 2**ipart */
3254 fpart = lp_build_extract_mantissa(bld, x);
3255
3256 /* ipart + fpart */
3257 return LLVMBuildFAdd(builder, ipart, fpart, "");
3258 }
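
/*
 * For example: x = 12.0 gives ipart = 3 - 1 = 2 and fpart = 12/2^3 = 1.5,
 * so fast_log2(12.0) = 3.5 (the exact value is 3.585). At powers of two,
 * e.g. x = 8.0, ipart = 2 and fpart = 1.0 give the exact result 3.0.
 */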
3259
3260
3261 /**
3262 * Fast implementation of iround(log2(x)).
3263 *
3264 * Not an approximation -- it should give accurate results all the time.
3265 */
3266 LLVMValueRef
3267 lp_build_ilog2(struct lp_build_context *bld,
3268 LLVMValueRef x)
3269 {
3270 LLVMBuilderRef builder = bld->gallivm->builder;
3271 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3272 LLVMValueRef ipart;
3273
3274 assert(bld->type.floating);
3275
3276 assert(lp_check_value(bld->type, x));
3277
3278 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3279 x = LLVMBuildFMul(builder, x, sqrt2, "");
3280
3281 /* ipart = floor(log2(x) + 0.5) */
3282 ipart = lp_build_extract_exponent(bld, x, 0);
3283
3284 return ipart;
3285 }
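
/*
 * For example: x = 5.0 scaled by sqrt(2) is 7.07, whose exponent is 2
 * = iround(log2(5.0)); x = 6.0 scaled is 8.49, whose exponent is 3
 * = iround(2.585). The sqrt(2) scaling turns the floor of the exponent
 * extraction into round-to-nearest.
 */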
3286
3287 LLVMValueRef
3288 lp_build_mod(struct lp_build_context *bld,
3289 LLVMValueRef x,
3290 LLVMValueRef y)
3291 {
3292 LLVMBuilderRef builder = bld->gallivm->builder;
3293 LLVMValueRef res;
3294 const struct lp_type type = bld->type;
3295
3296 assert(lp_check_value(type, x));
3297 assert(lp_check_value(type, y));
3298
3299 if (type.floating)
3300 res = LLVMBuildFRem(builder, x, y, "");
3301 else if (type.sign)
3302 res = LLVMBuildSRem(builder, x, y, "");
3303 else
3304 res = LLVMBuildURem(builder, x, y, "");
3305 return res;
3306 }