src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper
  32  *
  33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
  34  * notably min/max and saturated operations), and it is often necessary to
  35  * resort to machine-specific intrinsics directly. The functions here hide all
  36  * these implementation details from the other modules.
  37  *
  38  * We also do simple expression simplification here. The reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include <float.h>
  49
  50 #include "util/u_memory.h"
  51 #include "util/u_debug.h"
  52 #include "util/u_math.h"
  53 #include "util/u_string.h"
  54 #include "util/u_cpu_detect.h"
  55
  56 #include "lp_bld_type.h"
  57 #include "lp_bld_const.h"
  58 #include "lp_bld_init.h"
  59 #include "lp_bld_intr.h"
  60 #include "lp_bld_logic.h"
  61 #include "lp_bld_pack.h"
  62 #include "lp_bld_debug.h"
  63 #include "lp_bld_bitarit.h"
  64 #include "lp_bld_arit.h"
  65
  66
/*
 * NOTE(review): presumably the degree of the polynomial used by the exp
 * approximation further down this file; its users are outside this
 * excerpt -- confirm.
 */
#define EXP_POLY_DEGREE 5

/*
 * NOTE(review): presumably the degree of the polynomial used by the log
 * approximation further down this file; its users are outside this
 * excerpt -- confirm.
 */
#define LOG_POLY_DEGREE 4
  70
  71
  72 /**
  73  * Generate min(a, b)
  74  * No checks for special case values of a or b = 1 or 0 are done.
  75  */
  76 static LLVMValueRef
  77 lp_build_min_simple(struct lp_build_context *bld,
  78                     LLVMValueRef a,
  79                     LLVMValueRef b)
  80 {
  81    const struct lp_type type = bld->type;
  82    const char *intrinsic = NULL;
  83    unsigned intr_size = 0;
  84    LLVMValueRef cond;
  85
  86    assert(lp_check_value(type, a));
  87    assert(lp_check_value(type, b));
  88
  89    /* TODO: optimize the constant case */
  90
  91    if (type.floating && util_cpu_caps.has_sse) {
  92       if (type.width == 32) {
  93          if (type.length == 1) {
  94             intrinsic = "llvm.x86.sse.min.ss";
  95             intr_size = 128;
  96          }
  97          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
  98             intrinsic = "llvm.x86.sse.min.ps";
  99             intr_size = 128;
 100          }
 101          else {
 102             intrinsic = "llvm.x86.avx.min.ps.256";
 103             intr_size = 256;
 104          }
 105       }
 106       if (type.width == 64 && util_cpu_caps.has_sse2) {
 107          if (type.length == 1) {
 108             intrinsic = "llvm.x86.sse2.min.sd";
 109             intr_size = 128;
 110          }
 111          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 112             intrinsic = "llvm.x86.sse2.min.pd";
 113             intr_size = 128;
 114          }
 115          else {
 116             intrinsic = "llvm.x86.avx.min.pd.256";
 117             intr_size = 256;
 118          }
 119       }
 120    }
 121    else if (type.floating && util_cpu_caps.has_altivec) {
 122       if (type.width == 32 && type.length == 4) {
 123          intrinsic = "llvm.ppc.altivec.vminfp";
 124          intr_size = 128;
 125       }
 126    } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
 127       intr_size = 128;
 128       if ((type.width == 8 || type.width == 16) &&
 129           (type.width * type.length <= 64) &&
 130           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 131          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 132                       __FUNCTION__);
 133          }
 134       if (type.width == 8 && !type.sign) {
 135          intrinsic = "llvm.x86.sse2.pminu.b";
 136       }
 137       else if (type.width == 16 && type.sign) {
 138          intrinsic = "llvm.x86.sse2.pmins.w";
 139       }
 140       if (util_cpu_caps.has_sse4_1) {
 141          if (type.width == 8 && type.sign) {
 142             intrinsic = "llvm.x86.sse41.pminsb";
 143          }
 144          if (type.width == 16 && !type.sign) {
 145             intrinsic = "llvm.x86.sse41.pminuw";
 146          }
 147          if (type.width == 32 && !type.sign) {
 148             intrinsic = "llvm.x86.sse41.pminud";
 149         }
 150          if (type.width == 32 && type.sign) {
 151             intrinsic = "llvm.x86.sse41.pminsd";
 152          }
 153       }
 154    } else if (util_cpu_caps.has_altivec) {
 155      intr_size = 128;
 156      if (type.width == 8) {
 157        if (!type.sign) {
 158          intrinsic = "llvm.ppc.altivec.vminub";
 159        } else {
 160          intrinsic = "llvm.ppc.altivec.vminsb";
 161        }
 162      } else if (type.width == 16) {
 163        if (!type.sign) {
 164          intrinsic = "llvm.ppc.altivec.vminuh";
 165        } else {
 166          intrinsic = "llvm.ppc.altivec.vminsh";
 167        }
 168      } else if (type.width == 32) {
 169        if (!type.sign) {
 170          intrinsic = "llvm.ppc.altivec.vminuw";
 171        } else {
 172          intrinsic = "llvm.ppc.altivec.vminsw";
 173        }
 174      }
 175    }
 176
 177    if(intrinsic) {
 178       return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 179                                                  type,
 180                                                  intr_size, a, b);
 181    }
 182
 183    cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 184    return lp_build_select(bld, cond, a, b);
 185 }
 186
 187
 188 /**
 189  * Generate max(a, b)
 190  * No checks for special case values of a or b = 1 or 0 are done.
 191  */
 192 static LLVMValueRef
 193 lp_build_max_simple(struct lp_build_context *bld,
 194                     LLVMValueRef a,
 195                     LLVMValueRef b)
 196 {
 197    const struct lp_type type = bld->type;
 198    const char *intrinsic = NULL;
 199    unsigned intr_size = 0;
 200    LLVMValueRef cond;
 201
 202    assert(lp_check_value(type, a));
 203    assert(lp_check_value(type, b));
 204
 205    /* TODO: optimize the constant case */
 206
 207    if (type.floating && util_cpu_caps.has_sse) {
 208       if (type.width == 32) {
 209          if (type.length == 1) {
 210             intrinsic = "llvm.x86.sse.max.ss";
 211             intr_size = 128;
 212          }
 213          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 214             intrinsic = "llvm.x86.sse.max.ps";
 215             intr_size = 128;
 216          }
 217          else {
 218             intrinsic = "llvm.x86.avx.max.ps.256";
 219             intr_size = 256;
 220          }
 221       }
 222       if (type.width == 64 && util_cpu_caps.has_sse2) {
 223          if (type.length == 1) {
 224             intrinsic = "llvm.x86.sse2.max.sd";
 225             intr_size = 128;
 226          }
 227          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 228             intrinsic = "llvm.x86.sse2.max.pd";
 229             intr_size = 128;
 230          }
 231          else {
 232             intrinsic = "llvm.x86.avx.max.pd.256";
 233             intr_size = 256;
 234          }
 235       }
 236    }
 237    else if (type.floating && util_cpu_caps.has_altivec) {
 238       if (type.width == 32 || type.length == 4) {
 239          intrinsic = "llvm.ppc.altivec.vmaxfp";
 240          intr_size = 128;
 241       }
 242    } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
 243       intr_size = 128;
 244       if ((type.width == 8 || type.width == 16) &&
 245           (type.width * type.length <= 64) &&
 246           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 247          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 248                       __FUNCTION__);
 249          }
 250       if (type.width == 8 && !type.sign) {
 251          intrinsic = "llvm.x86.sse2.pmaxu.b";
 252          intr_size = 128;
 253       }
 254       else if (type.width == 16 && type.sign) {
 255          intrinsic = "llvm.x86.sse2.pmaxs.w";
 256       }
 257       if (util_cpu_caps.has_sse4_1) {
 258          if (type.width == 8 && type.sign) {
 259             intrinsic = "llvm.x86.sse41.pmaxsb";
 260          }
 261          if (type.width == 16 && !type.sign) {
 262             intrinsic = "llvm.x86.sse41.pmaxuw";
 263          }
 264          if (type.width == 32 && !type.sign) {
 265             intrinsic = "llvm.x86.sse41.pmaxud";
 266         }
 267          if (type.width == 32 && type.sign) {
 268             intrinsic = "llvm.x86.sse41.pmaxsd";
 269          }
 270       }
 271    } else if (util_cpu_caps.has_altivec) {
 272      intr_size = 128;
 273      if (type.width == 8) {
 274        if (!type.sign) {
 275          intrinsic = "llvm.ppc.altivec.vmaxub";
 276        } else {
 277          intrinsic = "llvm.ppc.altivec.vmaxsb";
 278        }
 279      } else if (type.width == 16) {
 280        if (!type.sign) {
 281          intrinsic = "llvm.ppc.altivec.vmaxuh";
 282        } else {
 283          intrinsic = "llvm.ppc.altivec.vmaxsh";
 284        }
 285      } else if (type.width == 32) {
 286        if (!type.sign) {
 287          intrinsic = "llvm.ppc.altivec.vmaxuw";
 288        } else {
 289          intrinsic = "llvm.ppc.altivec.vmaxsw";
 290        }
 291      }
 292    }
 293
 294    if(intrinsic) {
 295       return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 296                                                  type,
 297                                                  intr_size, a, b);
 298    }
 299
 300    cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 301    return lp_build_select(bld, cond, a, b);
 302 }
 303
 304
 305 /**
 306  * Generate 1 - a, or ~a depending on bld->type.
 307  */
 308 LLVMValueRef
 309 lp_build_comp(struct lp_build_context *bld,
 310               LLVMValueRef a)
 311 {
 312    LLVMBuilderRef builder = bld->gallivm->builder;
 313    const struct lp_type type = bld->type;
 314
 315    assert(lp_check_value(type, a));
 316
 317    if(a == bld->one)
 318       return bld->zero;
 319    if(a == bld->zero)
 320       return bld->one;
 321
 322    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 323       if(LLVMIsConstant(a))
 324          return LLVMConstNot(a);
 325       else
 326          return LLVMBuildNot(builder, a, "");
 327    }
 328
 329    if(LLVMIsConstant(a))
 330       if (type.floating)
 331           return LLVMConstFSub(bld->one, a);
 332       else
 333           return LLVMConstSub(bld->one, a);
 334    else
 335       if (type.floating)
 336          return LLVMBuildFSub(builder, bld->one, a, "");
 337       else
 338          return LLVMBuildSub(builder, bld->one, a, "");
 339 }
 340
 341
 342 /**
 343  * Generate a + b
 344  */
 345 LLVMValueRef
 346 lp_build_add(struct lp_build_context *bld,
 347              LLVMValueRef a,
 348              LLVMValueRef b)
 349 {
 350    LLVMBuilderRef builder = bld->gallivm->builder;
 351    const struct lp_type type = bld->type;
 352    LLVMValueRef res;
 353
 354    assert(lp_check_value(type, a));
 355    assert(lp_check_value(type, b));
 356
 357    if(a == bld->zero)
 358       return b;
 359    if(b == bld->zero)
 360       return a;
 361    if(a == bld->undef || b == bld->undef)
 362       return bld->undef;
 363
 364    if(bld->type.norm) {
 365       const char *intrinsic = NULL;
 366
 367       if(a == bld->one || b == bld->one)
 368         return bld->one;
 369
 370       if (type.width * type.length == 128 &&
 371           !type.floating && !type.fixed) {
 372          if(util_cpu_caps.has_sse2) {
 373            if(type.width == 8)
 374              intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
 375            if(type.width == 16)
 376              intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
 377          } else if (util_cpu_caps.has_altivec) {
 378            if(type.width == 8)
 379               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
 380            if(type.width == 16)
 381               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
 382          }
 383       }
 384
 385       if(intrinsic)
 386          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 387    }
 388
 389    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 390       if (type.floating)
 391          res = LLVMConstFAdd(a, b);
 392       else
 393          res = LLVMConstAdd(a, b);
 394    else
 395       if (type.floating)
 396          res = LLVMBuildFAdd(builder, a, b, "");
 397       else
 398          res = LLVMBuildAdd(builder, a, b, "");
 399
 400    /* clamp to ceiling of 1.0 */
 401    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 402       res = lp_build_min_simple(bld, res, bld->one);
 403
 404    /* XXX clamp to floor of -1 or 0??? */
 405
 406    return res;
 407 }
 408
 409
 410 /** Return the scalar sum of the elements of a.
 411  * Should avoid this operation whenever possible.
 412  */
 413 LLVMValueRef
 414 lp_build_horizontal_add(struct lp_build_context *bld,
 415                         LLVMValueRef a)
 416 {
 417    LLVMBuilderRef builder = bld->gallivm->builder;
 418    const struct lp_type type = bld->type;
 419    LLVMValueRef index, res;
 420    unsigned i, length;
 421    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
 422    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
 423    LLVMValueRef vecres, elem2;
 424
 425    assert(lp_check_value(type, a));
 426
 427    if (type.length == 1) {
 428       return a;
 429    }
 430
 431    assert(!bld->type.norm);
 432
 433    /*
 434     * for byte vectors can do much better with psadbw.
 435     * Using repeated shuffle/adds here. Note with multiple vectors
 436     * this can be done more efficiently as outlined in the intel
 437     * optimization manual.
 438     * Note: could cause data rearrangement if used with smaller element
 439     * sizes.
 440     */
 441
 442    vecres = a;
 443    length = type.length / 2;
 444    while (length > 1) {
 445       LLVMValueRef vec1, vec2;
 446       for (i = 0; i < length; i++) {
 447          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
 448          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
 449       }
 450       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
 451                                     LLVMConstVector(shuffles1, length), "");
 452       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
 453                                     LLVMConstVector(shuffles2, length), "");
 454       if (type.floating) {
 455          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
 456       }
 457       else {
 458          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
 459       }
 460       length = length >> 1;
 461    }
 462
 463    /* always have vector of size 2 here */
 464    assert(length == 1);
 465
 466    index = lp_build_const_int32(bld->gallivm, 0);
 467    res = LLVMBuildExtractElement(builder, vecres, index, "");
 468    index = lp_build_const_int32(bld->gallivm, 1);
 469    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
 470
 471    if (type.floating)
 472       res = LLVMBuildFAdd(builder, res, elem2, "");
 473     else
 474       res = LLVMBuildAdd(builder, res, elem2, "");
 475
 476    return res;
 477 }
 478
 479 /**
 480  * Return the horizontal sums of 4 float vectors as a float4 vector.
 481  * This uses the technique as outlined in Intel Optimization Manual.
 482  */
 483 static LLVMValueRef
 484 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
 485                             LLVMValueRef src[4])
 486 {
 487    struct gallivm_state *gallivm = bld->gallivm;
 488    LLVMBuilderRef builder = gallivm->builder;
 489    LLVMValueRef shuffles[4];
 490    LLVMValueRef tmp[4];
 491    LLVMValueRef sumtmp[2], shuftmp[2];
 492
 493    /* lower half of regs */
 494    shuffles[0] = lp_build_const_int32(gallivm, 0);
 495    shuffles[1] = lp_build_const_int32(gallivm, 1);
 496    shuffles[2] = lp_build_const_int32(gallivm, 4);
 497    shuffles[3] = lp_build_const_int32(gallivm, 5);
 498    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
 499                                    LLVMConstVector(shuffles, 4), "");
 500    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
 501                                    LLVMConstVector(shuffles, 4), "");
 502
 503    /* upper half of regs */
 504    shuffles[0] = lp_build_const_int32(gallivm, 2);
 505    shuffles[1] = lp_build_const_int32(gallivm, 3);
 506    shuffles[2] = lp_build_const_int32(gallivm, 6);
 507    shuffles[3] = lp_build_const_int32(gallivm, 7);
 508    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
 509                                    LLVMConstVector(shuffles, 4), "");
 510    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
 511                                    LLVMConstVector(shuffles, 4), "");
 512
 513    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
 514    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
 515
 516    shuffles[0] = lp_build_const_int32(gallivm, 0);
 517    shuffles[1] = lp_build_const_int32(gallivm, 2);
 518    shuffles[2] = lp_build_const_int32(gallivm, 4);
 519    shuffles[3] = lp_build_const_int32(gallivm, 6);
 520    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 521                                        LLVMConstVector(shuffles, 4), "");
 522
 523    shuffles[0] = lp_build_const_int32(gallivm, 1);
 524    shuffles[1] = lp_build_const_int32(gallivm, 3);
 525    shuffles[2] = lp_build_const_int32(gallivm, 5);
 526    shuffles[3] = lp_build_const_int32(gallivm, 7);
 527    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 528                                        LLVMConstVector(shuffles, 4), "");
 529
 530    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
 531 }
 532
 533
 534 /*
 535  * partially horizontally add 2-4 float vectors with length nx4,
 536  * i.e. only four adjacent values in each vector will be added,
 537  * assuming values are really grouped in 4 which also determines
 538  * output order.
 539  *
 540  * Return a vector of the same length as the initial vectors,
 541  * with the excess elements (if any) being undefined.
 542  * The element order is independent of number of input vectors.
 543  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 544  * the output order thus will be
 545  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
 546  */
 547 LLVMValueRef
 548 lp_build_hadd_partial4(struct lp_build_context *bld,
 549                        LLVMValueRef vectors[],
 550                        unsigned num_vecs)
 551 {
 552    struct gallivm_state *gallivm = bld->gallivm;
 553    LLVMBuilderRef builder = gallivm->builder;
 554    LLVMValueRef ret_vec;
 555    LLVMValueRef tmp[4];
 556    const char *intrinsic = NULL;
 557
 558    assert(num_vecs >= 2 && num_vecs <= 4);
 559    assert(bld->type.floating);
 560
 561    /* only use this with at least 2 vectors, as it is sort of expensive
 562     * (depending on cpu) and we always need two horizontal adds anyway,
 563     * so a shuffle/add approach might be better.
 564     */
 565
 566    tmp[0] = vectors[0];
 567    tmp[1] = vectors[1];
 568
 569    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
 570    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
 571
 572    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
 573        bld->type.length == 4) {
 574       intrinsic = "llvm.x86.sse3.hadd.ps";
 575    }
 576    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
 577             bld->type.length == 8) {
 578       intrinsic = "llvm.x86.avx.hadd.ps.256";
 579    }
 580    if (intrinsic) {
 581       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
 582                                        lp_build_vec_type(gallivm, bld->type),
 583                                        tmp[0], tmp[1]);
 584       if (num_vecs > 2) {
 585          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
 586                                           lp_build_vec_type(gallivm, bld->type),
 587                                           tmp[2], tmp[3]);
 588       }
 589       else {
 590          tmp[1] = tmp[0];
 591       }
 592       return lp_build_intrinsic_binary(builder, intrinsic,
 593                                        lp_build_vec_type(gallivm, bld->type),
 594                                        tmp[0], tmp[1]);
 595    }
 596
 597    if (bld->type.length == 4) {
 598       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
 599    }
 600    else {
 601       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
 602       unsigned j;
 603       unsigned num_iter = bld->type.length / 4;
 604       struct lp_type parttype = bld->type;
 605       parttype.length = 4;
 606       for (j = 0; j < num_iter; j++) {
 607          LLVMValueRef partsrc[4];
 608          unsigned i;
 609          for (i = 0; i < 4; i++) {
 610             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
 611          }
 612          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
 613       }
 614       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
 615    }
 616    return ret_vec;
 617 }
 618
 619 /**
 620  * Generate a - b
 621  */
 622 LLVMValueRef
 623 lp_build_sub(struct lp_build_context *bld,
 624              LLVMValueRef a,
 625              LLVMValueRef b)
 626 {
 627    LLVMBuilderRef builder = bld->gallivm->builder;
 628    const struct lp_type type = bld->type;
 629    LLVMValueRef res;
 630
 631    assert(lp_check_value(type, a));
 632    assert(lp_check_value(type, b));
 633
 634    if(b == bld->zero)
 635       return a;
 636    if(a == bld->undef || b == bld->undef)
 637       return bld->undef;
 638    if(a == b)
 639       return bld->zero;
 640
 641    if(bld->type.norm) {
 642       const char *intrinsic = NULL;
 643
 644       if(b == bld->one)
 645         return bld->zero;
 646
 647       if (type.width * type.length == 128 &&
 648           !type.floating && !type.fixed) {
 649          if (util_cpu_caps.has_sse2) {
 650            if(type.width == 8)
 651               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
 652            if(type.width == 16)
 653               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
 654          } else if (util_cpu_caps.has_altivec) {
 655            if(type.width == 8)
 656               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
 657            if(type.width == 16)
 658               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
 659          }
 660       }
 661
 662       if(intrinsic)
 663          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 664    }
 665
 666    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 667       if (type.floating)
 668          res = LLVMConstFSub(a, b);
 669       else
 670          res = LLVMConstSub(a, b);
 671    else
 672       if (type.floating)
 673          res = LLVMBuildFSub(builder, a, b, "");
 674       else
 675          res = LLVMBuildSub(builder, a, b, "");
 676
 677    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 678       res = lp_build_max_simple(bld, res, bld->zero);
 679
 680    return res;
 681 }
 682
 683
 684
 685 /**
 686  * Normalized multiplication.
 687  *
 688  * There are several approaches for (using 8-bit normalized multiplication as
 689  * an example):
 690  *
 691  * - alpha plus one
 692  *
 693  *     makes the following approximation to the division (Sree)
 694  *
 695  *       a*b/255 ~= (a*(b + 1)) >> 256
 696  *
 697  *     which is the fastest method that satisfies the following OpenGL criteria of
 698  *
 699  *       0*0 = 0 and 255*255 = 255
 700  *
 701  * - geometric series
 702  *
 703  *     takes the geometric series approximation to the division
 704  *
 705  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 706  *
 707  *     in this case just the first two terms to fit in 16bit arithmetic
 708  *
 709  *       t/255 ~= (t + (t >> 8)) >> 8
 710  *
 711  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 712  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 713  *     must be used.
 714  *
 715  * - geometric series plus rounding
 716  *
 717  *     when using a geometric series division instead of truncating the result
 718  *     use roundoff in the approximation (Jim Blinn)
 719  *
 720  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 721  *
 722  *     achieving the exact results.
 723  *
 724  *
 725  *
 726  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 727  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 728  * @sa Michael Herf, The "double blend trick", May 2000,
 729  *     http://www.stereopsis.com/doubleblend.html
 730  */
 731 static LLVMValueRef
 732 lp_build_mul_norm(struct gallivm_state *gallivm,
 733                   struct lp_type wide_type,
 734                   LLVMValueRef a, LLVMValueRef b)
 735 {
 736    LLVMBuilderRef builder = gallivm->builder;
 737    struct lp_build_context bld;
 738    unsigned n;
 739    LLVMValueRef half;
 740    LLVMValueRef ab;
 741
 742    assert(!wide_type.floating);
 743    assert(lp_check_value(wide_type, a));
 744    assert(lp_check_value(wide_type, b));
 745
 746    lp_build_context_init(&bld, gallivm, wide_type);
 747
 748    n = wide_type.width / 2;
 749    if (wide_type.sign) {
 750       --n;
 751    }
 752
 753    /*
 754     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
 755     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
 756     */
 757
 758    /*
 759     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
 760     */
 761
 762    ab = LLVMBuildMul(builder, a, b, "");
 763    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
 764
 765    /*
 766     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
 767     */
 768
 769    half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
 770    if (wide_type.sign) {
 771       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
 772       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
 773       half = lp_build_select(&bld, sign, minus_half, half);
 774    }
 775    ab = LLVMBuildAdd(builder, ab, half, "");
 776
 777    /* Final division */
 778    ab = lp_build_shr_imm(&bld, ab, n);
 779
 780    return ab;
 781 }
 782
 783 /**
 784  * Generate a * b
 785  */
 786 LLVMValueRef
 787 lp_build_mul(struct lp_build_context *bld,
 788              LLVMValueRef a,
 789              LLVMValueRef b)
 790 {
 791    LLVMBuilderRef builder = bld->gallivm->builder;
 792    const struct lp_type type = bld->type;
 793    LLVMValueRef shift;
 794    LLVMValueRef res;
 795
 796    assert(lp_check_value(type, a));
 797    assert(lp_check_value(type, b));
 798
 799    if(a == bld->zero)
 800       return bld->zero;
 801    if(a == bld->one)
 802       return b;
 803    if(b == bld->zero)
 804       return bld->zero;
 805    if(b == bld->one)
 806       return a;
 807    if(a == bld->undef || b == bld->undef)
 808       return bld->undef;
 809
 810    if (!type.floating && !type.fixed && type.norm) {
 811       struct lp_type wide_type = lp_wider_type(type);
 812       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 813
 814       lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
 815       lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
 816
 817       /* PMULLW, PSRLW, PADDW */
 818       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
 819       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
 820
 821       ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
 822
 823       return ab;
 824    }
 825
 826    if(type.fixed)
 827       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
 828    else
 829       shift = NULL;
 830
 831    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 832       if (type.floating)
 833          res = LLVMConstFMul(a, b);
 834       else
 835          res = LLVMConstMul(a, b);
 836       if(shift) {
 837          if(type.sign)
 838             res = LLVMConstAShr(res, shift);
 839          else
 840             res = LLVMConstLShr(res, shift);
 841       }
 842    }
 843    else {
 844       if (type.floating)
 845          res = LLVMBuildFMul(builder, a, b, "");
 846       else
 847          res = LLVMBuildMul(builder, a, b, "");
 848       if(shift) {
 849          if(type.sign)
 850             res = LLVMBuildAShr(builder, res, shift, "");
 851          else
 852             res = LLVMBuildLShr(builder, res, shift, "");
 853       }
 854    }
 855
 856    return res;
 857 }
 858
 859
 860 /**
 861  * Small vector x scale multiplication optimization.
 862  */
 863 LLVMValueRef
 864 lp_build_mul_imm(struct lp_build_context *bld,
 865                  LLVMValueRef a,
 866                  int b)
 867 {
 868    LLVMBuilderRef builder = bld->gallivm->builder;
 869    LLVMValueRef factor;
 870
 871    assert(lp_check_value(bld->type, a));
 872
 873    if(b == 0)
 874       return bld->zero;
 875
 876    if(b == 1)
 877       return a;
 878
 879    if(b == -1)
 880       return lp_build_negate(bld, a);
 881
 882    if(b == 2 && bld->type.floating)
 883       return lp_build_add(bld, a, a);
 884
 885    if(util_is_power_of_two(b)) {
 886       unsigned shift = ffs(b) - 1;
 887
 888       if(bld->type.floating) {
 889 #if 0
 890          /*
 891           * Power of two multiplication by directly manipulating the exponent.
 892           *
 893           * XXX: This might not be always faster, it will introduce a small error
 894           * for multiplication by zero, and it will produce wrong results
 895           * for Inf and NaN.
 896           */
 897          unsigned mantissa = lp_mantissa(bld->type);
 898          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
 899          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
 900          a = LLVMBuildAdd(builder, a, factor, "");
 901          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
 902          return a;
 903 #endif
 904       }
 905       else {
 906          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
 907          return LLVMBuildShl(builder, a, factor, "");
 908       }
 909    }
 910
 911    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
 912    return lp_build_mul(bld, a, factor);
 913 }
 914
 915
 916 /**
 917  * Generate a / b
 918  */
 919 LLVMValueRef
 920 lp_build_div(struct lp_build_context *bld,
 921              LLVMValueRef a,
 922              LLVMValueRef b)
 923 {
 924    LLVMBuilderRef builder = bld->gallivm->builder;
 925    const struct lp_type type = bld->type;
 926
 927    assert(lp_check_value(type, a));
 928    assert(lp_check_value(type, b));
 929
 930    if(a == bld->zero)
 931       return bld->zero;
 932    if(a == bld->one)
 933       return lp_build_rcp(bld, b);
 934    if(b == bld->zero)
 935       return bld->undef;
 936    if(b == bld->one)
 937       return a;
 938    if(a == bld->undef || b == bld->undef)
 939       return bld->undef;
 940
 941    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 942       if (type.floating)
 943          return LLVMConstFDiv(a, b);
 944       else if (type.sign)
 945          return LLVMConstSDiv(a, b);
 946       else
 947          return LLVMConstUDiv(a, b);
 948    }
 949
 950    if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
 951        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
 952       type.floating)
 953       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
 954
 955    if (type.floating)
 956       return LLVMBuildFDiv(builder, a, b, "");
 957    else if (type.sign)
 958       return LLVMBuildSDiv(builder, a, b, "");
 959    else
 960       return LLVMBuildUDiv(builder, a, b, "");
 961 }
 962
 963
 964 /**
 965  * Linear interpolation helper.
 966  *
 967  * @param normalized whether we are interpolating normalized values,
 968  *        encoded in normalized integers, twice as wide.
 969  *
 970  * @sa http://www.stereopsis.com/doubleblend.html
 971  */
 972 static INLINE LLVMValueRef
 973 lp_build_lerp_simple(struct lp_build_context *bld,
 974                      LLVMValueRef x,
 975                      LLVMValueRef v0,
 976                      LLVMValueRef v1,
 977                      unsigned flags)
 978 {
 979    unsigned half_width = bld->type.width/2;
 980    LLVMBuilderRef builder = bld->gallivm->builder;
 981    LLVMValueRef delta;
 982    LLVMValueRef res;
 983
 984    assert(lp_check_value(bld->type, x));
 985    assert(lp_check_value(bld->type, v0));
 986    assert(lp_check_value(bld->type, v1));
 987
 988    delta = lp_build_sub(bld, v1, v0);
 989
 990    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
 991       if (!bld->type.sign) {
 992          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
 993             /*
 994              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
 995              * most-significant-bit to the lowest-significant-bit, so that
 996              * later we can just divide by 2**n instead of 2**n - 1.
 997              */
 998
 999             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1000          }
1001
1002          /* (x * delta) >> n */
1003          res = lp_build_mul(bld, x, delta);
1004          res = lp_build_shr_imm(bld, res, half_width);
1005       } else {
1006          /*
1007           * The rescaling trick above doesn't work for signed numbers, so
1008           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1009           * instead.
1010           */
1011          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1012          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1013       }
1014    } else {
1015       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1016       res = lp_build_mul(bld, x, delta);
1017    }
1018
1019    res = lp_build_add(bld, v0, res);
1020
1021    if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1022        bld->type.fixed) {
1023       /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1024       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1025        * but it will be wrong for true fixed point use cases. Basically we need
1026        * a more powerful lp_type, capable of further distinguishing the values
1027        * interpretation from the value storage. */
1028       res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1029    }
1030
1031    return res;
1032 }
1033
1034
1035 /**
1036  * Linear interpolation.
1037  */
1038 LLVMValueRef
1039 lp_build_lerp(struct lp_build_context *bld,
1040               LLVMValueRef x,
1041               LLVMValueRef v0,
1042               LLVMValueRef v1,
1043               unsigned flags)
1044 {
1045    const struct lp_type type = bld->type;
1046    LLVMValueRef res;
1047
1048    assert(lp_check_value(type, x));
1049    assert(lp_check_value(type, v0));
1050    assert(lp_check_value(type, v1));
1051
1052    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1053
1054    if (type.norm) {
1055       struct lp_type wide_type;
1056       struct lp_build_context wide_bld;
1057       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1058
1059       assert(type.length >= 2);
1060
1061       /*
1062        * Create a wider integer type, enough to hold the
1063        * intermediate result of the multiplication.
1064        */
1065       memset(&wide_type, 0, sizeof wide_type);
1066       wide_type.sign   = type.sign;
1067       wide_type.width  = type.width*2;
1068       wide_type.length = type.length/2;
1069
1070       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1071
1072       lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1073       lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1074       lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1075
1076       /*
1077        * Lerp both halves.
1078        */
1079
1080       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1081
1082       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1083       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1084
1085       res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1086    } else {
1087       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1088    }
1089
1090    return res;
1091 }
1092
1093
1094 /**
1095  * Bilinear interpolation.
1096  *
1097  * Values indices are in v_{yx}.
1098  */
1099 LLVMValueRef
1100 lp_build_lerp_2d(struct lp_build_context *bld,
1101                  LLVMValueRef x,
1102                  LLVMValueRef y,
1103                  LLVMValueRef v00,
1104                  LLVMValueRef v01,
1105                  LLVMValueRef v10,
1106                  LLVMValueRef v11,
1107                  unsigned flags)
1108 {
1109    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1110    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1111    return lp_build_lerp(bld, y, v0, v1, flags);
1112 }
1113
1114
1115 LLVMValueRef
1116 lp_build_lerp_3d(struct lp_build_context *bld,
1117                  LLVMValueRef x,
1118                  LLVMValueRef y,
1119                  LLVMValueRef z,
1120                  LLVMValueRef v000,
1121                  LLVMValueRef v001,
1122                  LLVMValueRef v010,
1123                  LLVMValueRef v011,
1124                  LLVMValueRef v100,
1125                  LLVMValueRef v101,
1126                  LLVMValueRef v110,
1127                  LLVMValueRef v111,
1128                  unsigned flags)
1129 {
1130    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1131    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1132    return lp_build_lerp(bld, z, v0, v1, flags);
1133 }
1134
1135
1136 /**
1137  * Generate min(a, b)
1138  * Do checks for special cases.
1139  */
1140 LLVMValueRef
1141 lp_build_min(struct lp_build_context *bld,
1142              LLVMValueRef a,
1143              LLVMValueRef b)
1144 {
1145    assert(lp_check_value(bld->type, a));
1146    assert(lp_check_value(bld->type, b));
1147
1148    if(a == bld->undef || b == bld->undef)
1149       return bld->undef;
1150
1151    if(a == b)
1152       return a;
1153
1154    if (bld->type.norm) {
1155       if (!bld->type.sign) {
1156          if (a == bld->zero || b == bld->zero) {
1157             return bld->zero;
1158          }
1159       }
1160       if(a == bld->one)
1161          return b;
1162       if(b == bld->one)
1163          return a;
1164    }
1165
1166    return lp_build_min_simple(bld, a, b);
1167 }
1168
1169
1170 /**
1171  * Generate max(a, b)
1172  * Do checks for special cases.
1173  */
1174 LLVMValueRef
1175 lp_build_max(struct lp_build_context *bld,
1176              LLVMValueRef a,
1177              LLVMValueRef b)
1178 {
1179    assert(lp_check_value(bld->type, a));
1180    assert(lp_check_value(bld->type, b));
1181
1182    if(a == bld->undef || b == bld->undef)
1183       return bld->undef;
1184
1185    if(a == b)
1186       return a;
1187
1188    if(bld->type.norm) {
1189       if(a == bld->one || b == bld->one)
1190          return bld->one;
1191       if (!bld->type.sign) {
1192          if (a == bld->zero) {
1193             return b;
1194          }
1195          if (b == bld->zero) {
1196             return a;
1197          }
1198       }
1199    }
1200
1201    return lp_build_max_simple(bld, a, b);
1202 }
1203
1204
1205 /**
1206  * Generate clamp(a, min, max)
1207  * Do checks for special cases.
1208  */
1209 LLVMValueRef
1210 lp_build_clamp(struct lp_build_context *bld,
1211                LLVMValueRef a,
1212                LLVMValueRef min,
1213                LLVMValueRef max)
1214 {
1215    assert(lp_check_value(bld->type, a));
1216    assert(lp_check_value(bld->type, min));
1217    assert(lp_check_value(bld->type, max));
1218
1219    a = lp_build_min(bld, a, max);
1220    a = lp_build_max(bld, a, min);
1221    return a;
1222 }
1223
1224
1225 /**
1226  * Generate abs(a)
1227  */
1228 LLVMValueRef
1229 lp_build_abs(struct lp_build_context *bld,
1230              LLVMValueRef a)
1231 {
1232    LLVMBuilderRef builder = bld->gallivm->builder;
1233    const struct lp_type type = bld->type;
1234    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1235
1236    assert(lp_check_value(type, a));
1237
1238    if(!type.sign)
1239       return a;
1240
1241    if(type.floating) {
1242       /* Mask out the sign bit */
1243       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1244       unsigned long long absMask = ~(1ULL << (type.width - 1));
1245       LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1246       a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1247       a = LLVMBuildAnd(builder, a, mask, "");
1248       a = LLVMBuildBitCast(builder, a, vec_type, "");
1249       return a;
1250    }
1251
1252    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1253       switch(type.width) {
1254       case 8:
1255          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1256       case 16:
1257          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1258       case 32:
1259          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1260       }
1261    }
1262    else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1263             (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1264             (type.width == 8 || type.width == 16 || type.width == 32)) {
1265       debug_printf("%s: inefficient code, should split vectors manually\n",
1266                    __FUNCTION__);
1267    }
1268
1269    return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1270 }
1271
1272
1273 LLVMValueRef
1274 lp_build_negate(struct lp_build_context *bld,
1275                 LLVMValueRef a)
1276 {
1277    LLVMBuilderRef builder = bld->gallivm->builder;
1278
1279    assert(lp_check_value(bld->type, a));
1280
1281 #if HAVE_LLVM >= 0x0207
1282    if (bld->type.floating)
1283       a = LLVMBuildFNeg(builder, a, "");
1284    else
1285 #endif
1286       a = LLVMBuildNeg(builder, a, "");
1287
1288    return a;
1289 }
1290
1291
1292 /** Return -1, 0 or +1 depending on the sign of a */
1293 LLVMValueRef
1294 lp_build_sgn(struct lp_build_context *bld,
1295              LLVMValueRef a)
1296 {
1297    LLVMBuilderRef builder = bld->gallivm->builder;
1298    const struct lp_type type = bld->type;
1299    LLVMValueRef cond;
1300    LLVMValueRef res;
1301
1302    assert(lp_check_value(type, a));
1303
1304    /* Handle non-zero case */
1305    if(!type.sign) {
1306       /* if not zero then sign must be positive */
1307       res = bld->one;
1308    }
1309    else if(type.floating) {
1310       LLVMTypeRef vec_type;
1311       LLVMTypeRef int_type;
1312       LLVMValueRef mask;
1313       LLVMValueRef sign;
1314       LLVMValueRef one;
1315       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1316
1317       int_type = lp_build_int_vec_type(bld->gallivm, type);
1318       vec_type = lp_build_vec_type(bld->gallivm, type);
1319       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1320
1321       /* Take the sign bit and add it to 1 constant */
1322       sign = LLVMBuildBitCast(builder, a, int_type, "");
1323       sign = LLVMBuildAnd(builder, sign, mask, "");
1324       one = LLVMConstBitCast(bld->one, int_type);
1325       res = LLVMBuildOr(builder, sign, one, "");
1326       res = LLVMBuildBitCast(builder, res, vec_type, "");
1327    }
1328    else
1329    {
1330       /* signed int/norm/fixed point */
1331       /* could use psign with sse3 and appropriate vectors here */
1332       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1333       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1334       res = lp_build_select(bld, cond, bld->one, minus_one);
1335    }
1336
1337    /* Handle zero */
1338    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1339    res = lp_build_select(bld, cond, bld->zero, res);
1340
1341    return res;
1342 }
1343
1344
1345 /**
1346  * Set the sign of float vector 'a' according to 'sign'.
1347  * If sign==0, return abs(a).
1348  * If sign==1, return -abs(a);
1349  * Other values for sign produce undefined results.
1350  */
1351 LLVMValueRef
1352 lp_build_set_sign(struct lp_build_context *bld,
1353                   LLVMValueRef a, LLVMValueRef sign)
1354 {
1355    LLVMBuilderRef builder = bld->gallivm->builder;
1356    const struct lp_type type = bld->type;
1357    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1358    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1359    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1360    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1361                              ~((unsigned long long) 1 << (type.width - 1)));
1362    LLVMValueRef val, res;
1363
1364    assert(type.floating);
1365    assert(lp_check_value(type, a));
1366
1367    /* val = reinterpret_cast<int>(a) */
1368    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1369    /* val = val & mask */
1370    val = LLVMBuildAnd(builder, val, mask, "");
1371    /* sign = sign << shift */
1372    sign = LLVMBuildShl(builder, sign, shift, "");
1373    /* res = val | sign */
1374    res = LLVMBuildOr(builder, val, sign, "");
1375    /* res = reinterpret_cast<float>(res) */
1376    res = LLVMBuildBitCast(builder, res, vec_type, "");
1377
1378    return res;
1379 }
1380
1381
1382 /**
1383  * Convert vector of (or scalar) int to vector of (or scalar) float.
1384  */
1385 LLVMValueRef
1386 lp_build_int_to_float(struct lp_build_context *bld,
1387                       LLVMValueRef a)
1388 {
1389    LLVMBuilderRef builder = bld->gallivm->builder;
1390    const struct lp_type type = bld->type;
1391    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1392
1393    assert(type.floating);
1394
1395    return LLVMBuildSIToFP(builder, a, vec_type, "");
1396 }
1397
1398 static boolean
1399 arch_rounding_available(const struct lp_type type)
1400 {
1401    if ((util_cpu_caps.has_sse4_1 &&
1402        (type.length == 1 || type.width*type.length == 128)) ||
1403        (util_cpu_caps.has_avx && type.width*type.length == 256))
1404       return TRUE;
1405    else if ((util_cpu_caps.has_altivec &&
1406             (type.width == 32 && type.length == 4)))
1407       return TRUE;
1408
1409    return FALSE;
1410 }
1411
/**
 * Rounding modes understood by the lp_build_round_*() helpers.
 *
 * The numeric values matter: lp_build_round_sse41() passes them unchanged
 * as the immediate operand of the round intrinsics.
 */
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST  = 0,   /* to the nearest integer */
   LP_BUILD_ROUND_FLOOR    = 1,   /* downwards */
   LP_BUILD_ROUND_CEIL     = 2,   /* upwards */
   LP_BUILD_ROUND_TRUNCATE = 3    /* toward zero */
};
1419
1420 /**
1421  * Helper for SSE4.1's ROUNDxx instructions.
1422  *
1423  * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1424  * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
1425  */
1426 static INLINE LLVMValueRef
1427 lp_build_round_sse41(struct lp_build_context *bld,
1428                      LLVMValueRef a,
1429                      enum lp_build_round_mode mode)
1430 {
1431    LLVMBuilderRef builder = bld->gallivm->builder;
1432    const struct lp_type type = bld->type;
1433    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1434    const char *intrinsic;
1435    LLVMValueRef res;
1436
1437    assert(type.floating);
1438
1439    assert(lp_check_value(type, a));
1440    assert(util_cpu_caps.has_sse4_1);
1441
1442    if (type.length == 1) {
1443       LLVMTypeRef vec_type;
1444       LLVMValueRef undef;
1445       LLVMValueRef args[3];
1446       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1447
1448       switch(type.width) {
1449       case 32:
1450          intrinsic = "llvm.x86.sse41.round.ss";
1451          break;
1452       case 64:
1453          intrinsic = "llvm.x86.sse41.round.sd";
1454          break;
1455       default:
1456          assert(0);
1457          return bld->undef;
1458       }
1459
1460       vec_type = LLVMVectorType(bld->elem_type, 4);
1461
1462       undef = LLVMGetUndef(vec_type);
1463
1464       args[0] = undef;
1465       args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1466       args[2] = LLVMConstInt(i32t, mode, 0);
1467
1468       res = lp_build_intrinsic(builder, intrinsic,
1469                                vec_type, args, Elements(args));
1470
1471       res = LLVMBuildExtractElement(builder, res, index0, "");
1472    }
1473    else {
1474       if (type.width * type.length == 128) {
1475          switch(type.width) {
1476          case 32:
1477             intrinsic = "llvm.x86.sse41.round.ps";
1478             break;
1479          case 64:
1480             intrinsic = "llvm.x86.sse41.round.pd";
1481             break;
1482          default:
1483             assert(0);
1484             return bld->undef;
1485          }
1486       }
1487       else {
1488          assert(type.width * type.length == 256);
1489          assert(util_cpu_caps.has_avx);
1490
1491          switch(type.width) {
1492          case 32:
1493             intrinsic = "llvm.x86.avx.round.ps.256";
1494             break;
1495          case 64:
1496             intrinsic = "llvm.x86.avx.round.pd.256";
1497             break;
1498          default:
1499             assert(0);
1500             return bld->undef;
1501          }
1502       }
1503
1504       res = lp_build_intrinsic_binary(builder, intrinsic,
1505                                       bld->vec_type, a,
1506                                       LLVMConstInt(i32t, mode, 0));
1507    }
1508
1509    return res;
1510 }
1511
1512
1513 static INLINE LLVMValueRef
1514 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1515                              LLVMValueRef a)
1516 {
1517    LLVMBuilderRef builder = bld->gallivm->builder;
1518    const struct lp_type type = bld->type;
1519    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1520    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1521    const char *intrinsic;
1522    LLVMValueRef res;
1523
1524    assert(type.floating);
1525    /* using the double precision conversions is a bit more complicated */
1526    assert(type.width == 32);
1527
1528    assert(lp_check_value(type, a));
1529    assert(util_cpu_caps.has_sse2);
1530
1531    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1532    if (type.length == 1) {
1533       LLVMTypeRef vec_type;
1534       LLVMValueRef undef;
1535       LLVMValueRef arg;
1536       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1537
1538       vec_type = LLVMVectorType(bld->elem_type, 4);
1539
1540       intrinsic = "llvm.x86.sse.cvtss2si";
1541
1542       undef = LLVMGetUndef(vec_type);
1543
1544       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1545
1546       res = lp_build_intrinsic_unary(builder, intrinsic,
1547                                      ret_type, arg);
1548    }
1549    else {
1550       if (type.width* type.length == 128) {
1551          intrinsic = "llvm.x86.sse2.cvtps2dq";
1552       }
1553       else {
1554          assert(type.width*type.length == 256);
1555          assert(util_cpu_caps.has_avx);
1556
1557          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1558       }
1559       res = lp_build_intrinsic_unary(builder, intrinsic,
1560                                      ret_type, a);
1561    }
1562
1563    return res;
1564 }
1565
1566
1567 /*
1568  */
1569 static INLINE LLVMValueRef
1570 lp_build_round_altivec(struct lp_build_context *bld,
1571                        LLVMValueRef a,
1572                        enum lp_build_round_mode mode)
1573 {
1574    LLVMBuilderRef builder = bld->gallivm->builder;
1575    const struct lp_type type = bld->type;
1576    const char *intrinsic = NULL;
1577
1578    assert(type.floating);
1579
1580    assert(lp_check_value(type, a));
1581    assert(util_cpu_caps.has_altivec);
1582
1583    switch (mode) {
1584    case LP_BUILD_ROUND_NEAREST:
1585       intrinsic = "llvm.ppc.altivec.vrfin";
1586       break;
1587    case LP_BUILD_ROUND_FLOOR:
1588       intrinsic = "llvm.ppc.altivec.vrfim";
1589       break;
1590    case LP_BUILD_ROUND_CEIL:
1591       intrinsic = "llvm.ppc.altivec.vrfip";
1592       break;
1593    case LP_BUILD_ROUND_TRUNCATE:
1594       intrinsic = "llvm.ppc.altivec.vrfiz";
1595       break;
1596    }
1597
1598    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1599 }
1600
1601 static INLINE LLVMValueRef
1602 lp_build_round_arch(struct lp_build_context *bld,
1603                     LLVMValueRef a,
1604                     enum lp_build_round_mode mode)
1605 {
1606    if (util_cpu_caps.has_sse4_1)
1607      return lp_build_round_sse41(bld, a, mode);
1608    else /* (util_cpu_caps.has_altivec) */
1609      return lp_build_round_altivec(bld, a, mode);
1610 }
1611
1612 /**
1613  * Return the integer part of a float (vector) value (== round toward zero).
1614  * The returned value is a float (vector).
1615  * Ex: trunc(-1.5) = -1.0
1616  */
1617 LLVMValueRef
1618 lp_build_trunc(struct lp_build_context *bld,
1619                LLVMValueRef a)
1620 {
1621    LLVMBuilderRef builder = bld->gallivm->builder;
1622    const struct lp_type type = bld->type;
1623
1624    assert(type.floating);
1625    assert(lp_check_value(type, a));
1626
1627    if (arch_rounding_available(type)) {
1628       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1629    }
1630    else {
1631       const struct lp_type type = bld->type;
1632       struct lp_type inttype;
1633       struct lp_build_context intbld;
1634       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1635       LLVMValueRef trunc, res, anosign, mask;
1636       LLVMTypeRef int_vec_type = bld->int_vec_type;
1637       LLVMTypeRef vec_type = bld->vec_type;
1638
1639       assert(type.width == 32); /* might want to handle doubles at some point */
1640
1641       inttype = type;
1642       inttype.floating = 0;
1643       lp_build_context_init(&intbld, bld->gallivm, inttype);
1644
1645       /* round by truncation */
1646       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1647       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1648
1649       /* mask out sign bit */
1650       anosign = lp_build_abs(bld, a);
1651       /*
1652        * mask out all values if anosign > 2^24
1653        * This should work both for large ints (all rounding is no-op for them
1654        * because such floats are always exact) as well as special cases like
1655        * NaNs, Infs (taking advantage of the fact they use max exponent).
1656        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1657        */
1658       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1659       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1660       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1661       return lp_build_select(bld, mask, a, res);
1662    }
1663 }
1664
1665
1666 /**
1667  * Return float (vector) rounded to nearest integer (vector).  The returned
1668  * value is a float (vector).
1669  * Ex: round(0.9) = 1.0
1670  * Ex: round(-1.5) = -2.0
1671  */
1672 LLVMValueRef
1673 lp_build_round(struct lp_build_context *bld,
1674                LLVMValueRef a)
1675 {
1676    LLVMBuilderRef builder = bld->gallivm->builder;
1677    const struct lp_type type = bld->type;
1678
1679    assert(type.floating);
1680    assert(lp_check_value(type, a));
1681
1682    if (arch_rounding_available(type)) {
1683       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1684    }
1685    else {
1686       const struct lp_type type = bld->type;
1687       struct lp_type inttype;
1688       struct lp_build_context intbld;
1689       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1690       LLVMValueRef res, anosign, mask;
1691       LLVMTypeRef int_vec_type = bld->int_vec_type;
1692       LLVMTypeRef vec_type = bld->vec_type;
1693
1694       assert(type.width == 32); /* might want to handle doubles at some point */
1695
1696       inttype = type;
1697       inttype.floating = 0;
1698       lp_build_context_init(&intbld, bld->gallivm, inttype);
1699
1700       res = lp_build_iround(bld, a);
1701       res = LLVMBuildSIToFP(builder, res, vec_type, "");
1702
1703       /* mask out sign bit */
1704       anosign = lp_build_abs(bld, a);
1705       /*
1706        * mask out all values if anosign > 2^24
1707        * This should work both for large ints (all rounding is no-op for them
1708        * because such floats are always exact) as well as special cases like
1709        * NaNs, Infs (taking advantage of the fact they use max exponent).
1710        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1711        */
1712       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1713       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1714       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1715       return lp_build_select(bld, mask, a, res);
1716    }
1717 }
1718
1719
1720 /**
1721  * Return floor of float (vector), result is a float (vector)
1722  * Ex: floor(1.1) = 1.0
1723  * Ex: floor(-1.1) = -2.0
1724  */
1725 LLVMValueRef
1726 lp_build_floor(struct lp_build_context *bld,
1727                LLVMValueRef a)
1728 {
1729    LLVMBuilderRef builder = bld->gallivm->builder;
1730    const struct lp_type type = bld->type;
1731
1732    assert(type.floating);
1733    assert(lp_check_value(type, a));
1734
1735    if (arch_rounding_available(type)) {
1736       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1737    }
1738    else {
1739       const struct lp_type type = bld->type;
1740       struct lp_type inttype;
1741       struct lp_build_context intbld;
1742       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1743       LLVMValueRef trunc, res, anosign, mask;
1744       LLVMTypeRef int_vec_type = bld->int_vec_type;
1745       LLVMTypeRef vec_type = bld->vec_type;
1746
1747       assert(type.width == 32); /* might want to handle doubles at some point */
1748
1749       inttype = type;
1750       inttype.floating = 0;
1751       lp_build_context_init(&intbld, bld->gallivm, inttype);
1752
1753       /* round by truncation */
1754       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1755       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1756
1757       if (type.sign) {
1758          LLVMValueRef tmp;
1759
1760          /*
1761           * fix values if rounding is wrong (for non-special cases)
1762           * - this is the case if trunc > a
1763           */
1764          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1765          /* tmp = trunc > a ? 1.0 : 0.0 */
1766          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1767          tmp = lp_build_and(&intbld, mask, tmp);
1768          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1769          res = lp_build_sub(bld, res, tmp);
1770       }
1771
1772       /* mask out sign bit */
1773       anosign = lp_build_abs(bld, a);
1774       /*
1775        * mask out all values if anosign > 2^24
1776        * This should work both for large ints (all rounding is no-op for them
1777        * because such floats are always exact) as well as special cases like
1778        * NaNs, Infs (taking advantage of the fact they use max exponent).
1779        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1780        */
1781       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1782       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1783       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1784       return lp_build_select(bld, mask, a, res);
1785    }
1786 }
1787
1788
1789 /**
1790  * Return ceiling of float (vector), returning float (vector).
1791  * Ex: ceil( 1.1) = 2.0
1792  * Ex: ceil(-1.1) = -1.0
1793  */
1794 LLVMValueRef
1795 lp_build_ceil(struct lp_build_context *bld,
1796               LLVMValueRef a)
1797 {
1798    LLVMBuilderRef builder = bld->gallivm->builder;
1799    const struct lp_type type = bld->type;
1800
1801    assert(type.floating);
1802    assert(lp_check_value(type, a));
1803
1804    if (arch_rounding_available(type)) {
1805       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1806    }
1807    else {
1808       const struct lp_type type = bld->type;
1809       struct lp_type inttype;
1810       struct lp_build_context intbld;
1811       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1812       LLVMValueRef trunc, res, anosign, mask, tmp;
1813       LLVMTypeRef int_vec_type = bld->int_vec_type;
1814       LLVMTypeRef vec_type = bld->vec_type;
1815
1816       assert(type.width == 32); /* might want to handle doubles at some point */
1817
1818       inttype = type;
1819       inttype.floating = 0;
1820       lp_build_context_init(&intbld, bld->gallivm, inttype);
1821
1822       /* round by truncation */
1823       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1824       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
1825
1826       /*
1827        * fix values if rounding is wrong (for non-special cases)
1828        * - this is the case if trunc < a
1829        */
1830       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
1831       /* tmp = trunc < a ? 1.0 : 0.0 */
1832       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1833       tmp = lp_build_and(&intbld, mask, tmp);
1834       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1835       res = lp_build_add(bld, trunc, tmp);
1836
1837       /* mask out sign bit */
1838       anosign = lp_build_abs(bld, a);
1839       /*
1840        * mask out all values if anosign > 2^24
1841        * This should work both for large ints (all rounding is no-op for them
1842        * because such floats are always exact) as well as special cases like
1843        * NaNs, Infs (taking advantage of the fact they use max exponent).
1844        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1845        */
1846       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1847       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1848       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1849       return lp_build_select(bld, mask, a, res);
1850    }
1851 }
1852
1853
1854 /**
1855  * Return fractional part of 'a' computed as a - floor(a)
1856  * Typically used in texture coord arithmetic.
1857  */
1858 LLVMValueRef
1859 lp_build_fract(struct lp_build_context *bld,
1860                LLVMValueRef a)
1861 {
1862    assert(bld->type.floating);
1863    return lp_build_sub(bld, a, lp_build_floor(bld, a));
1864 }
1865
1866
1867 /**
1868  * Prevent returning a fractional part of 1.0 for very small negative values of
1869  * 'a' by clamping against 0.99999(9).
1870  */
1871 static inline LLVMValueRef
1872 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
1873 {
1874    LLVMValueRef max;
1875
1876    /* this is the largest number smaller than 1.0 representable as float */
1877    max = lp_build_const_vec(bld->gallivm, bld->type,
1878                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
1879    return lp_build_min(bld, fract, max);
1880 }
1881
1882
1883 /**
1884  * Same as lp_build_fract, but guarantees that the result is always smaller
1885  * than one.
1886  */
1887 LLVMValueRef
1888 lp_build_fract_safe(struct lp_build_context *bld,
1889                     LLVMValueRef a)
1890 {
1891    return clamp_fract(bld, lp_build_fract(bld, a));
1892 }
1893
1894
1895 /**
1896  * Return the integer part of a float (vector) value (== round toward zero).
1897  * The returned value is an integer (vector).
1898  * Ex: itrunc(-1.5) = -1
1899  */
1900 LLVMValueRef
1901 lp_build_itrunc(struct lp_build_context *bld,
1902                 LLVMValueRef a)
1903 {
1904    LLVMBuilderRef builder = bld->gallivm->builder;
1905    const struct lp_type type = bld->type;
1906    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1907
1908    assert(type.floating);
1909    assert(lp_check_value(type, a));
1910
1911    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1912 }
1913
1914
1915 /**
1916  * Return float (vector) rounded to nearest integer (vector).  The returned
1917  * value is an integer (vector).
1918  * Ex: iround(0.9) = 1
1919  * Ex: iround(-1.5) = -2
1920  */
1921 LLVMValueRef
1922 lp_build_iround(struct lp_build_context *bld,
1923                 LLVMValueRef a)
1924 {
1925    LLVMBuilderRef builder = bld->gallivm->builder;
1926    const struct lp_type type = bld->type;
1927    LLVMTypeRef int_vec_type = bld->int_vec_type;
1928    LLVMValueRef res;
1929
1930    assert(type.floating);
1931
1932    assert(lp_check_value(type, a));
1933
1934    if ((util_cpu_caps.has_sse2 &&
1935        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
1936        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1937       return lp_build_iround_nearest_sse2(bld, a);
1938    }
1939    if (arch_rounding_available(type)) {
1940       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1941    }
1942    else {
1943       LLVMValueRef half;
1944
1945       half = lp_build_const_vec(bld->gallivm, type, 0.5);
1946
1947       if (type.sign) {
1948          LLVMTypeRef vec_type = bld->vec_type;
1949          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1950                                     (unsigned long long)1 << (type.width - 1));
1951          LLVMValueRef sign;
1952
1953          /* get sign bit */
1954          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1955          sign = LLVMBuildAnd(builder, sign, mask, "");
1956
1957          /* sign * 0.5 */
1958          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1959          half = LLVMBuildOr(builder, sign, half, "");
1960          half = LLVMBuildBitCast(builder, half, vec_type, "");
1961       }
1962
1963       res = LLVMBuildFAdd(builder, a, half, "");
1964    }
1965
1966    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1967
1968    return res;
1969 }
1970
1971
1972 /**
1973  * Return floor of float (vector), result is an int (vector)
1974  * Ex: ifloor(1.1) = 1.0
1975  * Ex: ifloor(-1.1) = -2.0
1976  */
1977 LLVMValueRef
1978 lp_build_ifloor(struct lp_build_context *bld,
1979                 LLVMValueRef a)
1980 {
1981    LLVMBuilderRef builder = bld->gallivm->builder;
1982    const struct lp_type type = bld->type;
1983    LLVMTypeRef int_vec_type = bld->int_vec_type;
1984    LLVMValueRef res;
1985
1986    assert(type.floating);
1987    assert(lp_check_value(type, a));
1988
1989    res = a;
1990    if (type.sign) {
1991       if (arch_rounding_available(type)) {
1992          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1993       }
1994       else {
1995          struct lp_type inttype;
1996          struct lp_build_context intbld;
1997          LLVMValueRef trunc, itrunc, mask;
1998
1999          assert(type.floating);
2000          assert(lp_check_value(type, a));
2001
2002          inttype = type;
2003          inttype.floating = 0;
2004          lp_build_context_init(&intbld, bld->gallivm, inttype);
2005
2006          /* round by truncation */
2007          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2008          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2009
2010          /*
2011           * fix values if rounding is wrong (for non-special cases)
2012           * - this is the case if trunc > a
2013           * The results of doing this with NaNs, very large values etc.
2014           * are undefined but this seems to be the case anyway.
2015           */
2016          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2017          /* cheapie minus one with mask since the mask is minus one / zero */
2018          return lp_build_add(&intbld, itrunc, mask);
2019       }
2020    }
2021
2022    /* round to nearest (toward zero) */
2023    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2024
2025    return res;
2026 }
2027
2028
2029 /**
2030  * Return ceiling of float (vector), returning int (vector).
2031  * Ex: iceil( 1.1) = 2
2032  * Ex: iceil(-1.1) = -1
2033  */
2034 LLVMValueRef
2035 lp_build_iceil(struct lp_build_context *bld,
2036                LLVMValueRef a)
2037 {
2038    LLVMBuilderRef builder = bld->gallivm->builder;
2039    const struct lp_type type = bld->type;
2040    LLVMTypeRef int_vec_type = bld->int_vec_type;
2041    LLVMValueRef res;
2042
2043    assert(type.floating);
2044    assert(lp_check_value(type, a));
2045
2046    if (arch_rounding_available(type)) {
2047       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2048    }
2049    else {
2050       struct lp_type inttype;
2051       struct lp_build_context intbld;
2052       LLVMValueRef trunc, itrunc, mask;
2053
2054       assert(type.floating);
2055       assert(lp_check_value(type, a));
2056
2057       inttype = type;
2058       inttype.floating = 0;
2059       lp_build_context_init(&intbld, bld->gallivm, inttype);
2060
2061       /* round by truncation */
2062       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2063       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2064
2065       /*
2066        * fix values if rounding is wrong (for non-special cases)
2067        * - this is the case if trunc < a
2068        * The results of doing this with NaNs, very large values etc.
2069        * are undefined but this seems to be the case anyway.
2070        */
2071       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2072       /* cheapie plus one with mask since the mask is minus one / zero */
2073       return lp_build_sub(&intbld, itrunc, mask);
2074    }
2075
2076    /* round to nearest (toward zero) */
2077    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2078
2079    return res;
2080 }
2081
2082
2083 /**
2084  * Combined ifloor() & fract().
2085  *
2086  * Preferred to calling the functions separately, as it will ensure that the
2087  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2088  */
2089 void
2090 lp_build_ifloor_fract(struct lp_build_context *bld,
2091                       LLVMValueRef a,
2092                       LLVMValueRef *out_ipart,
2093                       LLVMValueRef *out_fpart)
2094 {
2095    LLVMBuilderRef builder = bld->gallivm->builder;
2096    const struct lp_type type = bld->type;
2097    LLVMValueRef ipart;
2098
2099    assert(type.floating);
2100    assert(lp_check_value(type, a));
2101
2102    if (arch_rounding_available(type)) {
2103       /*
2104        * floor() is easier.
2105        */
2106
2107       ipart = lp_build_floor(bld, a);
2108       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2109       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2110    }
2111    else {
2112       /*
2113        * ifloor() is easier.
2114        */
2115
2116       *out_ipart = lp_build_ifloor(bld, a);
2117       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2118       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2119    }
2120 }
2121
2122
2123 /**
2124  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2125  * always smaller than one.
2126  */
2127 void
2128 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2129                            LLVMValueRef a,
2130                            LLVMValueRef *out_ipart,
2131                            LLVMValueRef *out_fpart)
2132 {
2133    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2134    *out_fpart = clamp_fract(bld, *out_fpart);
2135 }
2136
2137
2138 LLVMValueRef
2139 lp_build_sqrt(struct lp_build_context *bld,
2140               LLVMValueRef a)
2141 {
2142    LLVMBuilderRef builder = bld->gallivm->builder;
2143    const struct lp_type type = bld->type;
2144    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2145    char intrinsic[32];
2146
2147    assert(lp_check_value(type, a));
2148
2149    /* TODO: optimize the constant case */
2150
2151    assert(type.floating);
2152    if (type.length == 1) {
2153       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2154    }
2155    else {
2156       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2157    }
2158
2159    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2160 }
2161
2162
2163 /**
2164  * Do one Newton-Raphson step to improve reciprocate precision:
2165  *
2166  *   x_{i+1} = x_i * (2 - a * x_i)
2167  *
2168  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2169  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2170  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2171  * halo. It would be necessary to clamp the argument to prevent this.
2172  *
2173  * See also:
2174  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2175  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2176  */
2177 static INLINE LLVMValueRef
2178 lp_build_rcp_refine(struct lp_build_context *bld,
2179                     LLVMValueRef a,
2180                     LLVMValueRef rcp_a)
2181 {
2182    LLVMBuilderRef builder = bld->gallivm->builder;
2183    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2184    LLVMValueRef res;
2185
2186    res = LLVMBuildFMul(builder, a, rcp_a, "");
2187    res = LLVMBuildFSub(builder, two, res, "");
2188    res = LLVMBuildFMul(builder, rcp_a, res, "");
2189
2190    return res;
2191 }
2192
2193
2194 LLVMValueRef
2195 lp_build_rcp(struct lp_build_context *bld,
2196              LLVMValueRef a)
2197 {
2198    LLVMBuilderRef builder = bld->gallivm->builder;
2199    const struct lp_type type = bld->type;
2200
2201    assert(lp_check_value(type, a));
2202
2203    if(a == bld->zero)
2204       return bld->undef;
2205    if(a == bld->one)
2206       return bld->one;
2207    if(a == bld->undef)
2208       return bld->undef;
2209
2210    assert(type.floating);
2211
2212    if(LLVMIsConstant(a))
2213       return LLVMConstFDiv(bld->one, a);
2214
2215    /*
2216     * We don't use RCPPS because:
2217     * - it only has 10bits of precision
2218     * - it doesn't even get the reciprocate of 1.0 exactly
2219     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2220     * - for recent processors the benefit over DIVPS is marginal, a case
2221     *   dependent
2222     *
2223     * We could still use it on certain processors if benchmarks show that the
2224     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2225     * particular uses that require less workarounds.
2226     */
2227
2228    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2229          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2230       const unsigned num_iterations = 0;
2231       LLVMValueRef res;
2232       unsigned i;
2233       const char *intrinsic = NULL;
2234
2235       if (type.length == 4) {
2236          intrinsic = "llvm.x86.sse.rcp.ps";
2237       }
2238       else {
2239          intrinsic = "llvm.x86.avx.rcp.ps.256";
2240       }
2241
2242       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2243
2244       for (i = 0; i < num_iterations; ++i) {
2245          res = lp_build_rcp_refine(bld, a, res);
2246       }
2247
2248       return res;
2249    }
2250
2251    return LLVMBuildFDiv(builder, bld->one, a, "");
2252 }
2253
2254
2255 /**
2256  * Do one Newton-Raphson step to improve rsqrt precision:
2257  *
2258  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2259  *
2260  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2261  */
2262 static INLINE LLVMValueRef
2263 lp_build_rsqrt_refine(struct lp_build_context *bld,
2264                       LLVMValueRef a,
2265                       LLVMValueRef rsqrt_a)
2266 {
2267    LLVMBuilderRef builder = bld->gallivm->builder;
2268    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2269    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2270    LLVMValueRef res;
2271
2272    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2273    res = LLVMBuildFMul(builder, a, res, "");
2274    res = LLVMBuildFSub(builder, three, res, "");
2275    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2276    res = LLVMBuildFMul(builder, half, res, "");
2277
2278    return res;
2279 }
2280
2281
2282 /**
2283  * Generate 1/sqrt(a).
2284  * Result is undefined for values < 0, infinity for +0.
2285  */
2286 LLVMValueRef
2287 lp_build_rsqrt(struct lp_build_context *bld,
2288                LLVMValueRef a)
2289 {
2290    LLVMBuilderRef builder = bld->gallivm->builder;
2291    const struct lp_type type = bld->type;
2292
2293    assert(lp_check_value(type, a));
2294
2295    assert(type.floating);
2296
2297    /*
2298     * This should be faster but all denormals will end up as infinity.
2299     */
2300    if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2301         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
2302       const unsigned num_iterations = 1;
2303       LLVMValueRef res;
2304       unsigned i;
2305       const char *intrinsic = NULL;
2306
2307       if (type.length == 4) {
2308          intrinsic = "llvm.x86.sse.rsqrt.ps";
2309       }
2310       else {
2311          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2312       }
2313       if (num_iterations) {
2314          /*
2315           * Newton-Raphson will result in NaN instead of infinity for zero,
2316           * and NaN instead of zero for infinity.
2317           * Also, need to ensure rsqrt(1.0) == 1.0.
2318           * All numbers smaller than FLT_MIN will result in +infinity
2319           * (rsqrtps treats all denormals as zero).
2320           */
2321          /*
2322           * Certain non-c99 compilers don't know INFINITY and might not support
2323           * hacks to evaluate it at compile time neither.
2324           */
2325          const unsigned posinf_int = 0x7F800000;
2326          LLVMValueRef cmp;
2327          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2328          LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2329
2330          inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2331
2332          res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2333
2334          for (i = 0; i < num_iterations; ++i) {
2335             res = lp_build_rsqrt_refine(bld, a, res);
2336          }
2337          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2338          res = lp_build_select(bld, cmp, inf, res);
2339          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2340          res = lp_build_select(bld, cmp, bld->zero, res);
2341          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2342          res = lp_build_select(bld, cmp, bld->one, res);
2343       }
2344       else {
2345          /* rsqrt(1.0) != 1.0 here */
2346          res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2347
2348       }
2349
2350       return res;
2351    }
2352
2353    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2354 }
2355
2356
/**
 * Generate sin(a).
 *
 * The code emits generic LLVM IR builder calls, but follows statement by
 * statement the SSE2 intrinsics sequence quoted in the comments below
 * (which in turn points at the cephes sources).  Outline:
 *  - strip the sign of the argument and remember it,
 *  - scale by 4/Pi and derive an even integer j from the result,
 *  - subtract j * Pi/4 from |a| in three steps (DP1..DP3),
 *  - evaluate two polynomials on the reduced argument and pick one per
 *    element with a mask derived from j,
 *  - xor the final sign bit back in.
 *
 * NOTE(review): the sign masks are hard-coded 32-bit constants
 * (0x80000000), so this presumably expects 32-bit float elements; nothing
 * in this function asserts it.
 *
 * @param bld  build context describing the float (vector) type
 * @param a    argument, of type bld->vec_type
 * @return     sin(a), of type bld->vec_type
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    *  take the absolute value,
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    *
    * Note this reads emm2_add rather than emm2_and; the & ~1 above only
    * clears bit 0, so bit 2 tested here is the same in both.
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    *
    * Shifting by 29 moves bit 2 up to bit 31, the position of sig_mask.
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   /*
    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * (the DP constants above are already negated, hence the adds below)
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    *
    * Horner evaluation: y = ((p0 * z + p1) * z + p2) * z * z
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    *
    * i.e. y = y - 0.5 * z + 1.  (The IR value names "y_8"/"y_9" given to
    * y_9/y_10 below are off by one relative to the C variable names.)
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (presumably the Pi/4 < x <= Pi/2 one
    * mentioned at the selection mask above)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    *
    * Done here on the integer bit patterns: y2 is kept where poly_mask is
    * set, y where it is clear, and the two halves are merged with an
    * integer add (assuming lp_build_compare yields all-ones / all-zeros
    * per element, the halves never overlap, so the add acts as an or).
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}
2574
2575
2576 /**
2577  * Generate cos(a) using SSE2
2578  */
2579 LLVMValueRef
2580 lp_build_cos(struct lp_build_context *bld,
2581              LLVMValueRef a)
2582 {
2583    struct gallivm_state *gallivm = bld->gallivm;
2584    LLVMBuilderRef builder = gallivm->builder;
2585    struct lp_type int_type = lp_int_type(bld->type);
2586    LLVMBuilderRef b = builder;
2587
2588    /*
2589     *  take the absolute value,
2590     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2591     */
2592
2593    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2594    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2595
2596    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2597    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2598
2599    /*
2600     * scale by 4/Pi
2601     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2602     */
2603
2604    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2605    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2606
2607    /*
2608     * store the integer part of y in mm0
2609     * emm2 = _mm_cvttps_epi32(y);
2610     */
2611
2612    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2613
2614    /*
2615     * j=(j+1) & (~1) (see the cephes sources)
2616     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2617     */
2618
2619    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2620    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2621    /*
2622     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2623     */
2624    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2625    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2626
2627    /*
2628     * y = _mm_cvtepi32_ps(emm2);
2629     */
2630    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2631
2632
2633    /*
2634     * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2635     */
2636    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2637    LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2638
2639
2640    /* get the swap sign flag
2641     * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2642     */
2643    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2644    LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2645    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2646    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2647
2648    /*
2649     * emm2 = _mm_slli_epi32(emm0, 29);
2650     */
2651    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2652    LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2653
2654    /*
2655     * get the polynom selection mask
2656     * there is one polynom for 0 <= x <= Pi/4
2657     * and another one for Pi/4<x<=Pi/2
2658     * Both branches will be computed.
2659     *
2660     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2661     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2662     */
2663
2664    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2665    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2666    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2667                                              int_type, PIPE_FUNC_EQUAL,
2668                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2669
2670    /*
2671     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2672     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2673     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2674     */
2675    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2676    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2677    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2678
2679    /*
2680     * The magic pass: "Extended precision modular arithmetic"
2681     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2682     * xmm1 = _mm_mul_ps(y, xmm1);
2683     * xmm2 = _mm_mul_ps(y, xmm2);
2684     * xmm3 = _mm_mul_ps(y, xmm3);
2685     */
2686    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2687    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2688    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2689
2690    /*
2691     * x = _mm_add_ps(x, xmm1);
2692     * x = _mm_add_ps(x, xmm2);
2693     * x = _mm_add_ps(x, xmm3);
2694     */
2695
2696    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2697    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2698    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2699
2700    /*
2701     * Evaluate the first polynom  (0 <= x <= Pi/4)
2702     *
2703     * z = _mm_mul_ps(x,x);
2704     */
2705    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2706
2707    /*
2708     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2709     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2710     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2711     */
2712    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2713    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2714    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2715
2716    /*
2717     * y = *(v4sf*)_ps_coscof_p0;
2718     * y = _mm_mul_ps(y, z);
2719     */
2720    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2721    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2722    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2723    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2724    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2725    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2726
2727
2728    /*
2729     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2730     * y = _mm_sub_ps(y, tmp);
2731     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2732     */
2733    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2734    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2735    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2736    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2737    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2738
2739    /*
2740     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2741     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2742     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2743     */
2744    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2745    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2746    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2747
2748    /*
2749     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2750     *
2751     * y2 = *(v4sf*)_ps_sincof_p0;
2752     * y2 = _mm_mul_ps(y2, z);
2753     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2754     * y2 = _mm_mul_ps(y2, z);
2755     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2756     * y2 = _mm_mul_ps(y2, z);
2757     * y2 = _mm_mul_ps(y2, x);
2758     * y2 = _mm_add_ps(y2, x);
2759     */
2760
2761    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2762    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2763    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2764    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2765    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2766    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2767    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2768
2769    /*
2770     * select the correct result from the two polynoms
2771     * xmm3 = poly_mask;
2772     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2773     * y = _mm_andnot_ps(xmm3, y);
2774     * y = _mm_add_ps(y,y2);
2775     */
2776    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2777    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2778    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2779    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2780    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2781    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2782
2783    /*
2784     * update the sign
2785     * y = _mm_xor_ps(y, sign_bit);
2786     */
2787    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2788    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2789    return y_result;
2790 }
2791
2792
2793 /**
2794  * Generate pow(x, y)
2795  */
2796 LLVMValueRef
2797 lp_build_pow(struct lp_build_context *bld,
2798              LLVMValueRef x,
2799              LLVMValueRef y)
2800 {
2801    /* TODO: optimize the constant case */
2802    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2803        LLVMIsConstant(x) && LLVMIsConstant(y)) {
2804       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2805                    __FUNCTION__);
2806    }
2807
2808    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2809 }
2810
2811
2812 /**
2813  * Generate exp(x)
2814  */
2815 LLVMValueRef
2816 lp_build_exp(struct lp_build_context *bld,
2817              LLVMValueRef x)
2818 {
2819    /* log2(e) = 1/log(2) */
2820    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2821                                            1.4426950408889634);
2822
2823    assert(lp_check_value(bld->type, x));
2824
2825    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2826 }
2827
2828
2829 /**
2830  * Generate log(x)
2831  */
2832 LLVMValueRef
2833 lp_build_log(struct lp_build_context *bld,
2834              LLVMValueRef x)
2835 {
2836    /* log(2) */
2837    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2838                                           0.69314718055994529);
2839
2840    assert(lp_check_value(bld->type, x));
2841
2842    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2843 }
2844
2845
2846 /**
2847  * Generate polynomial.
2848  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2849  */
2850 static LLVMValueRef
2851 lp_build_polynomial(struct lp_build_context *bld,
2852                     LLVMValueRef x,
2853                     const double *coeffs,
2854                     unsigned num_coeffs)
2855 {
2856    const struct lp_type type = bld->type;
2857    LLVMValueRef even = NULL, odd = NULL;
2858    LLVMValueRef x2;
2859    unsigned i;
2860
2861    assert(lp_check_value(bld->type, x));
2862
2863    /* TODO: optimize the constant case */
2864    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2865        LLVMIsConstant(x)) {
2866       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2867                    __FUNCTION__);
2868    }
2869
2870    /*
2871     * Calculate odd and even terms seperately to decrease data dependency
2872     * Ex:
2873     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
2874     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2875     */
2876    x2 = lp_build_mul(bld, x, x);
2877
2878    for (i = num_coeffs; i--; ) {
2879       LLVMValueRef coeff;
2880
2881       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2882
2883       if (i % 2 == 0) {
2884          if (even)
2885             even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2886          else
2887             even = coeff;
2888       } else {
2889          if (odd)
2890             odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2891          else
2892             odd = coeff;
2893       }
2894    }
2895
2896    if (odd)
2897       return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2898    else if (even)
2899       return even;
2900    else
2901       return bld->undef;
2902 }
2903
2904
2905 /**
2906  * Minimax polynomial fit of 2**x, in range [0, 1[
2907  */
2908 const double lp_build_exp2_polynomial[] = {
2909 #if EXP_POLY_DEGREE == 5
2910    0.999999925063526176901,
2911    0.693153073200168932794,
2912    0.240153617044375388211,
2913    0.0558263180532956664775,
2914    0.00898934009049466391101,
2915    0.00187757667519147912699
2916 #elif EXP_POLY_DEGREE == 4
2917    1.00000259337069434683,
2918    0.693003834469974940458,
2919    0.24144275689150793076,
2920    0.0520114606103070150235,
2921    0.0135341679161270268764
2922 #elif EXP_POLY_DEGREE == 3
2923    0.999925218562710312959,
2924    0.695833540494823811697,
2925    0.226067155427249155588,
2926    0.0780245226406372992967
2927 #elif EXP_POLY_DEGREE == 2
2928    1.00172476321474503578,
2929    0.657636275736077639316,
2930    0.33718943461968720704
2931 #else
2932 #error
2933 #endif
2934 };
2935
2936
2937 void
2938 lp_build_exp2_approx(struct lp_build_context *bld,
2939                      LLVMValueRef x,
2940                      LLVMValueRef *p_exp2_int_part,
2941                      LLVMValueRef *p_frac_part,
2942                      LLVMValueRef *p_exp2)
2943 {
2944    LLVMBuilderRef builder = bld->gallivm->builder;
2945    const struct lp_type type = bld->type;
2946    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2947    LLVMValueRef ipart = NULL;
2948    LLVMValueRef fpart = NULL;
2949    LLVMValueRef expipart = NULL;
2950    LLVMValueRef expfpart = NULL;
2951    LLVMValueRef res = NULL;
2952
2953    assert(lp_check_value(bld->type, x));
2954
2955    if(p_exp2_int_part || p_frac_part || p_exp2) {
2956       /* TODO: optimize the constant case */
2957       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2958           LLVMIsConstant(x)) {
2959          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2960                       __FUNCTION__);
2961       }
2962
2963       assert(type.floating && type.width == 32);
2964
2965       x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
2966       x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2967
2968       /* ipart = floor(x) */
2969       /* fpart = x - ipart */
2970       lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2971    }
2972
2973    if(p_exp2_int_part || p_exp2) {
2974       /* expipart = (float) (1 << ipart) */
2975       expipart = LLVMBuildAdd(builder, ipart,
2976                               lp_build_const_int_vec(bld->gallivm, type, 127), "");
2977       expipart = LLVMBuildShl(builder, expipart,
2978                               lp_build_const_int_vec(bld->gallivm, type, 23), "");
2979       expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2980    }
2981
2982    if(p_exp2) {
2983       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2984                                      Elements(lp_build_exp2_polynomial));
2985
2986       res = LLVMBuildFMul(builder, expipart, expfpart, "");
2987    }
2988
2989    if(p_exp2_int_part)
2990       *p_exp2_int_part = expipart;
2991
2992    if(p_frac_part)
2993       *p_frac_part = fpart;
2994
2995    if(p_exp2)
2996       *p_exp2 = res;
2997 }
2998
2999
3000 LLVMValueRef
3001 lp_build_exp2(struct lp_build_context *bld,
3002               LLVMValueRef x)
3003 {
3004    LLVMValueRef res;
3005    lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3006    return res;
3007 }
3008
3009
3010 /**
3011  * Extract the exponent of a IEEE-754 floating point value.
3012  *
3013  * Optionally apply an integer bias.
3014  *
3015  * Result is an integer value with
3016  *
3017  *   ifloor(log2(x)) + bias
3018  */
3019 LLVMValueRef
3020 lp_build_extract_exponent(struct lp_build_context *bld,
3021                           LLVMValueRef x,
3022                           int bias)
3023 {
3024    LLVMBuilderRef builder = bld->gallivm->builder;
3025    const struct lp_type type = bld->type;
3026    unsigned mantissa = lp_mantissa(type);
3027    LLVMValueRef res;
3028
3029    assert(type.floating);
3030
3031    assert(lp_check_value(bld->type, x));
3032
3033    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3034
3035    res = LLVMBuildLShr(builder, x,
3036                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3037    res = LLVMBuildAnd(builder, res,
3038                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3039    res = LLVMBuildSub(builder, res,
3040                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3041
3042    return res;
3043 }
3044
3045
3046 /**
3047  * Extract the mantissa of the a floating.
3048  *
3049  * Result is a floating point value with
3050  *
3051  *   x / floor(log2(x))
3052  */
3053 LLVMValueRef
3054 lp_build_extract_mantissa(struct lp_build_context *bld,
3055                           LLVMValueRef x)
3056 {
3057    LLVMBuilderRef builder = bld->gallivm->builder;
3058    const struct lp_type type = bld->type;
3059    unsigned mantissa = lp_mantissa(type);
3060    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3061                                                   (1ULL << mantissa) - 1);
3062    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3063    LLVMValueRef res;
3064
3065    assert(lp_check_value(bld->type, x));
3066
3067    assert(type.floating);
3068
3069    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3070
3071    /* res = x / 2**ipart */
3072    res = LLVMBuildAnd(builder, x, mantmask, "");
3073    res = LLVMBuildOr(builder, res, one, "");
3074    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3075
3076    return res;
3077 }
3078
3079
3080
3081 /**
3082  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3083  * These coefficients can be generate with
3084  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3085  */
3086 const double lp_build_log2_polynomial[] = {
3087 #if LOG_POLY_DEGREE == 5
3088    2.88539008148777786488L,
3089    0.961796878841293367824L,
3090    0.577058946784739859012L,
3091    0.412914355135828735411L,
3092    0.308591899232910175289L,
3093    0.352376952300281371868L,
3094 #elif LOG_POLY_DEGREE == 4
3095    2.88539009343309178325L,
3096    0.961791550404184197881L,
3097    0.577440339438736392009L,
3098    0.403343858251329912514L,
3099    0.406718052498846252698L,
3100 #elif LOG_POLY_DEGREE == 3
3101    2.88538959748872753838L,
3102    0.961932915889597772928L,
3103    0.571118517972136195241L,
3104    0.493997535084709500285L,
3105 #else
3106 #error
3107 #endif
3108 };
3109
3110 /**
3111  * See http://www.devmaster.net/forums/showthread.php?p=43580
3112  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3113  * http://www.nezumi.demon.co.uk/consult/logx.htm
3114  */
3115 void
3116 lp_build_log2_approx(struct lp_build_context *bld,
3117                      LLVMValueRef x,
3118                      LLVMValueRef *p_exp,
3119                      LLVMValueRef *p_floor_log2,
3120                      LLVMValueRef *p_log2)
3121 {
3122    LLVMBuilderRef builder = bld->gallivm->builder;
3123    const struct lp_type type = bld->type;
3124    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3125    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3126
3127    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3128    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3129    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3130
3131    LLVMValueRef i = NULL;
3132    LLVMValueRef y = NULL;
3133    LLVMValueRef z = NULL;
3134    LLVMValueRef exp = NULL;
3135    LLVMValueRef mant = NULL;
3136    LLVMValueRef logexp = NULL;
3137    LLVMValueRef logmant = NULL;
3138    LLVMValueRef res = NULL;
3139
3140    assert(lp_check_value(bld->type, x));
3141
3142    if(p_exp || p_floor_log2 || p_log2) {
3143       /* TODO: optimize the constant case */
3144       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3145           LLVMIsConstant(x)) {
3146          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3147                       __FUNCTION__);
3148       }
3149
3150       assert(type.floating && type.width == 32);
3151
3152       /*
3153        * We don't explicitly handle denormalized numbers. They will yield a
3154        * result in the neighbourhood of -127, which appears to be adequate
3155        * enough.
3156        */
3157
3158       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3159
3160       /* exp = (float) exponent(x) */
3161       exp = LLVMBuildAnd(builder, i, expmask, "");
3162    }
3163
3164    if(p_floor_log2 || p_log2) {
3165       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3166       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3167       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3168    }
3169
3170    if(p_log2) {
3171       /* mant = 1 + (float) mantissa(x) */
3172       mant = LLVMBuildAnd(builder, i, mantmask, "");
3173       mant = LLVMBuildOr(builder, mant, one, "");
3174       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3175
3176       /* y = (mant - 1) / (mant + 1) */
3177       y = lp_build_div(bld,
3178          lp_build_sub(bld, mant, bld->one),
3179          lp_build_add(bld, mant, bld->one)
3180       );
3181
3182       /* z = y^2 */
3183       z = lp_build_mul(bld, y, y);
3184
3185       /* compute P(z) */
3186       logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3187                                     Elements(lp_build_log2_polynomial));
3188
3189       /* logmant = y * P(z) */
3190       logmant = lp_build_mul(bld, y, logmant);
3191
3192       res = lp_build_add(bld, logmant, logexp);
3193    }
3194
3195    if(p_exp) {
3196       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3197       *p_exp = exp;
3198    }
3199
3200    if(p_floor_log2)
3201       *p_floor_log2 = logexp;
3202
3203    if(p_log2)
3204       *p_log2 = res;
3205 }
3206
3207
3208 LLVMValueRef
3209 lp_build_log2(struct lp_build_context *bld,
3210               LLVMValueRef x)
3211 {
3212    LLVMValueRef res;
3213    lp_build_log2_approx(bld, x, NULL, NULL, &res);
3214    return res;
3215 }
3216
3217
3218 /**
3219  * Faster (and less accurate) log2.
3220  *
3221  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3222  *
3223  * Piece-wise linear approximation, with exact results when x is a
3224  * power of two.
3225  *
3226  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3227  */
3228 LLVMValueRef
3229 lp_build_fast_log2(struct lp_build_context *bld,
3230                    LLVMValueRef x)
3231 {
3232    LLVMBuilderRef builder = bld->gallivm->builder;
3233    LLVMValueRef ipart;
3234    LLVMValueRef fpart;
3235
3236    assert(lp_check_value(bld->type, x));
3237
3238    assert(bld->type.floating);
3239
3240    /* ipart = floor(log2(x)) - 1 */
3241    ipart = lp_build_extract_exponent(bld, x, -1);
3242    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3243
3244    /* fpart = x / 2**ipart */
3245    fpart = lp_build_extract_mantissa(bld, x);
3246
3247    /* ipart + fpart */
3248    return LLVMBuildFAdd(builder, ipart, fpart, "");
3249 }
3250
3251
3252 /**
3253  * Fast implementation of iround(log2(x)).
3254  *
3255  * Not an approximation -- it should give accurate results all the time.
3256  */
3257 LLVMValueRef
3258 lp_build_ilog2(struct lp_build_context *bld,
3259                LLVMValueRef x)
3260 {
3261    LLVMBuilderRef builder = bld->gallivm->builder;
3262    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3263    LLVMValueRef ipart;
3264
3265    assert(bld->type.floating);
3266
3267    assert(lp_check_value(bld->type, x));
3268
3269    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3270    x = LLVMBuildFMul(builder, x, sqrt2, "");
3271
3272    /* ipart = floor(log2(x) + 0.5)  */
3273    ipart = lp_build_extract_exponent(bld, x, 0);
3274
3275    return ipart;
3276 }
3277
3278 LLVMValueRef
3279 lp_build_mod(struct lp_build_context *bld,
3280              LLVMValueRef x,
3281              LLVMValueRef y)
3282 {
3283    LLVMBuilderRef builder = bld->gallivm->builder;
3284    LLVMValueRef res;
3285    const struct lp_type type = bld->type;
3286
3287    assert(lp_check_value(type, x));
3288    assert(lp_check_value(type, y));
3289
3290    if (type.floating)
3291       res = LLVMBuildFRem(builder, x, y, "");
3292    else if (type.sign)
3293       res = LLVMBuildSRem(builder, x, y, "");
3294    else
3295       res = LLVMBuildURem(builder, x, y, "");
3296    return res;
3297 }