/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/**
 * @file
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
#include <float.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4
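/*
 * EXP_POLY_DEGREE/LOG_POLY_DEGREE above set how many terms the polynomial
 * approximations in the exp2/log2 helpers further down this file get:
 * a higher degree buys precision at the cost of a few extra multiply-adds
 * per call.
 */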
/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}
/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}
/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
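/*
 * Note on the unsigned-norm fast path below: for an unsigned normalized type
 * 1.0 is the all-ones bit pattern, so 1.0 - a and ~a compute the same value
 * and the bitwise not is cheaper.
 */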
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->one)
      return bld->zero;
   if (a == bld->zero)
      return bld->one;

   if (type.norm && !type.floating && !type.fixed && !type.sign) {
      if (LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if (LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}
/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (bld->type.norm) {
      const char *intrinsic = NULL;

      if (a == bld->one || b == bld->one)
        return bld->one;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
         } else if (util_cpu_caps.has_altivec) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* TODO: handle signed case */
   if (type.norm && !type.floating && !type.fixed && !type.sign)
      a = lp_build_min_simple(bld, a, lp_build_comp(bld, b));

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * XXX: for byte vectors can do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;

   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating)
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      else
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in Intel Optimization Manual.
 */
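/*
 * Sketch of the data movement below, writing src[0] = x0 x1 x2 x3,
 * src[1] = y0 y1 y2 y3, src[2] = z0 z1 z2 z3, src[3] = w0 w1 w2 w3:
 *
 *   tmp[0] = x0 x1 y0 y1        tmp[1] = x2 x3 y2 y3
 *   tmp[2] = z0 z1 w0 w1        tmp[3] = z2 z3 w2 w3
 *   sumtmp[0] = x0+x2 x1+x3 y0+y2 y1+y3
 *   sumtmp[1] = z0+z2 z1+z3 w0+w2 w1+w3
 *
 * The final pair of shuffles picks the even/odd lanes of sumtmp so the
 * last add produces (sum x, sum y, sum z, sum w).
 */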
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}
/*
 * partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * the number of output values.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];
   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}
/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (bld->type.norm) {
      const char *intrinsic = NULL;

      if (b == bld->one)
        return bld->zero;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
         } else if (util_cpu_caps.has_altivec) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* TODO: handle signed case */
   if (type.norm && !type.floating && !type.fixed && !type.sign)
      a = lp_build_max_simple(bld, a, b);

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   /* clamp to floor of 0 */
   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}
/**
 * Normalized multiplication.
 *
 * There are several approaches for (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
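/*
 * Worked example of the variants above for a = b = 255 (t = a*b = 65025):
 *
 *   alpha plus one:      (255*(255 + 1)) >> 8               = 65280 >> 8 = 255
 *   geometric series:    (65025 + (65025 >> 8)) >> 8        = 65279 >> 8 = 254
 *   series + rounding:   (65025 + (65025 >> 8) + 0x80) >> 8 = 65407 >> 8 = 255
 *
 * which is why the rounding variant (the one implemented below) can be exact
 * while the plain series needs the b = 255 special case.
 */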
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return b;
   if (b == bld->zero)
      return bld->zero;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if (type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if (shift) {
         if (type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if (shift) {
         if (type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if (b == 0)
      return bld->zero;

   if (b == 1)
      return a;

   if (b == -1)
      return lp_build_negate(bld, a);

   if (b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if (util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if (bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
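         /*
          * Sketch of why this works: an IEEE float is (sign | exp | mantissa)
          * with value m * 2^(exp - bias), so adding (shift << mantissa_bits)
          * to the bit pattern bumps the exponent field by 'shift', i.e.
          * multiplies the value by 2^shift -- provided the input is a normal,
          * nonzero, finite number.
          */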
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->gallivm, bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return lp_build_rcp(bld, b);
   if (b == bld->zero)
      return bld->undef;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if (((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
       type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}
/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most-significant-bit to the lowest-significant-bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
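            /*
             * Numeric sketch with n = 8 (8-bit weights widened to 16 bits):
             * x = 255 becomes 255 + (255 >> 7) = 256, so the shift by n below
             * yields delta exactly, while x = 0 stays 0; intermediate weights
             * stay within roughly one LSB of the exact x/255 scale.
             */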
            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   res = lp_build_add(bld, v0, res);

   if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
       bld->type.fixed) {
      /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for true fixed point use cases. Basically we need
       * a more powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
   }

   return res;
}
/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   }
   else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}
/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}
/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if (a == bld->one)
         return b;
      if (b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}
/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b);
}
/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}
/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if (!type.sign)
      return a;

   if (type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if (type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if (!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if (type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}
/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}
static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;

   return FALSE;
}
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};
/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
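/*
 * Examples of nearest-even vs. round-half-away-from-zero (the latter is what
 * the non-SSE4.1 iround fallback further below effectively computes by adding
 * a sign-copied 0.5 and truncating):
 *   0.5 -> 0.0 vs 1.0;  1.5 -> 2.0 vs 2.0;  2.5 -> 2.0 vs 3.0.
 */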
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}
static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width*type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}
static INLINE LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_altivec);

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}
static INLINE LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, mode);
   else /* (util_cpu_caps.has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      /* Note: must be 1 << 24 (2^24), not the xor expression 2^24 == 26. */
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return floor of float (vector), result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}
/**
 * Prevent returning a fractional part of 1.0 for very small negative values of
 * 'a' by clamping against 0.99999(9).
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min(bld, fract, max);
}
/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}
/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}
/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}
/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   if (type.length == 1) {
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
   }
   else {
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
   }

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
/**
 * Do one Newton-Raphson step to improve reciprocate precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo.  It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
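/*
 * Convergence is quadratic: if x_i = (1 + e) / a for relative error e, then
 * x_{i+1} = x_i * (2 - a * x_i) = (1 + e)(1 - e) / a = (1 - e^2) / a,
 * so each step roughly doubles the number of correct mantissa bits.
 */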
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->zero)
      return bld->undef;
   if (a == bld->one)
      return bld->one;
   if (a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if (LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocate of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
                 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}
/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
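/*
 * As with rcp above, convergence is quadratic: for x_i = (1 + e) / sqrt(a)
 * this step gives x_{i+1} = (1 - 1.5*e^2 - 0.5*e^3) / sqrt(a), i.e. the
 * relative error is roughly squared (times 1.5) each iteration.
 */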
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}
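
/*
 * Scalar sketch of the step above (illustrative only; the helper name is
 * hypothetical):
 *
 *    static float rsqrt_refine_scalar(float a, float rsqrt_a)
 *    {
 *       return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
 *    }
 */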


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         /*
          * Certain non-c99 compilers don't know INFINITY and might not
          * support hacks to evaluate it at compile time either.
          */
         const unsigned posinf_int = 0x7F800000;
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);

         inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * If there's a fast (inaccurate) rsqrt instruction available
 * (caller may want to avoid calling rsqrt_fast if it's not available,
 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if rsqrt
 * is unavailable that would result in sqrt/div/mul, so it is obviously
 * much better to just call sqrt, skipping both div and mul).
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return TRUE;
   }
   return FALSE;
}
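
/*
 * Illustrative caller-side sketch of the x^0.5 strategy described above
 * (variable names are hypothetical):
 *
 *    LLVMValueRef sqrt_x;
 *    if (lp_build_fast_rsqrt_available(bld->type))
 *       sqrt_x = lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
 *    else
 *       sqrt_x = lp_build_sqrt(bld, x);
 *
 * since x * rsqrt(x) == sqrt(x) for x > 0.
 */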


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) using SSE2
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (0 <= x <= Pi/4)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   return y_result;
}
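
/*
 * A rough scalar picture of the algorithm above (illustrative only; s0-s2
 * and c0-c2 stand for the sincof/coscof coefficients):
 *
 *    int   j = ((int)(fabsf(a) * 4.0f / M_PI) + 1) & ~1;    octant index
 *    float y = (float)j;
 *    float x = ((fabsf(a) - y * 0.78515625f)
 *                         - y * 2.4187564849853515625e-4f)
 *                         - y * 3.77489497744594108e-8f;
 *    float z = x * x;
 *    float sinp = x + x * z * (s2 + z * (s1 + z * s0));
 *    float cosp = 1.0f - 0.5f * z + z * z * (c2 + z * (c1 + z * c0));
 *    float r = (j & 2) ? cosp : sinp;
 *    return (((j & 4) != 0) ^ (a < 0.0f)) ? -r : r;
 *
 * The code above computes both polynomials and selects with poly_mask
 * instead of branching, and applies the sign with an integer xor.
 */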


/**
 * Generate cos(a) using SSE2
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");

   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (0 <= x <= Pi/4)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   return y_result;
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}
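
/*
 * I.e. this relies on the identity x^y == 2^(y * log2(x)), valid for
 * x > 0.  A scalar C illustration (sketch only):
 *
 *    float pow_sketch(float x, float y)
 *    {
 *       return exp2f(y * log2f(x));
 *    }
 */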

/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x)
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}


/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
         else
            even = coeff;
      }
      else {
         if (odd)
            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
   else if (even)
      return even;
   else
      return bld->undef;
}
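
/*
 * Concretely, for four coefficients the split evaluates (illustrative
 * scalar form):
 *
 *    float x2   = x * x;
 *    float even = c[0] + x2 * c[2];
 *    float odd  = c[1] + x2 * c[3];
 *    return even + x * odd;
 *
 * which equals Horner evaluation of c[0] + c[1]*x + c[2]*x^2 + c[3]*x^3,
 * but with a dependency chain roughly half as long.
 */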


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999925063526176901,
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
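
/*
 * The expipart construction above is the standard IEEE-754 trick for
 * 2^ipart: for integer ipart in [-126, 127], placing (ipart + 127) in the
 * exponent bits yields the float 2^ipart.  Scalar sketch (illustrative
 * only):
 *
 *    union { int32_t i; float f; } v;
 *    v.i = (ipart + 127) << 23;
 *    float two_ipart = v.f;    now two_ipart == 2^ipart
 */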


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
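
/*
 * Scalar equivalent for 32-bit floats (illustrative only):
 *
 *    union { float f; uint32_t i; } v;
 *    v.f = x;
 *    int e = (int)((v.i >> 23) & 0xff) - (127 - bias);
 *
 * e.g. for x = 8.0 and bias = 0 the stored exponent field is 130, so the
 * result is 130 - 127 = 3 == floor(log2(8.0)).
 */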


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}


/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};


/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one));

      /* z = y**2 */
      z = lp_build_mul(bld, y, y);

      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* logmant = y * P(z) */
      logmant = lp_build_mul(bld, y, logmant);

      res = lp_build_add(bld, logmant, logexp);
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
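
/*
 * In other words, with mant in [1, 2) the code computes (illustrative
 * scalar form):
 *
 *    log2(x)    = exponent(x) + log2(mant)
 *    log2(mant) = y * P(y*y),  where y = (mant - 1) / (mant + 1)
 *
 * which is the atanh-based series log2(m) = (2/ln 2) * (y + y^3/3 + ...),
 * here approximated by the minimax polynomial P above.
 */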


LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
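
/*
 * Scalar sketch (illustrative): with m = mantissa(x) in [1, 2),
 *
 *    fast_log2(x) = (floor(log2(x)) - 1) + m
 *
 * e.g. fast_log2(8.0) = (3 - 1) + 1.0 = 3.0 exactly, while values between
 * powers of two are linearly interpolated.
 */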


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
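
/*
 * Why this works (illustrative): multiplying by sqrt(2) adds exactly 0.5
 * to log2(x), so reading the exponent field of x * sqrt(2) computes
 * floor(log2(x) + 0.5) == iround(log2(x)).  E.g. for x = 5.0,
 * log2(5) ~= 2.32; x * sqrt(2) ~= 7.07, whose exponent is 2, so the
 * result is 2.
 */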


LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}