1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_intr.h"
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
64 #define EXP_POLY_DEGREE 5
66 #define LOG_POLY_DEGREE 5
71 * No checks for special case values of a or b = 1 or 0 are done.
74 lp_build_min_simple(struct lp_build_context
*bld
,
78 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
79 const struct lp_type type
= bld
->type
;
80 const char *intrinsic
= NULL
;
83 assert(lp_check_value(type
, a
));
84 assert(lp_check_value(type
, b
));
86 /* TODO: optimize the constant case */
88 if(type
.width
* type
.length
== 128) {
90 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
91 intrinsic
= "llvm.x86.sse.min.ps";
92 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
93 intrinsic
= "llvm.x86.sse2.min.pd";
96 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
97 intrinsic
= "llvm.x86.sse2.pminu.b";
98 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
99 intrinsic
= "llvm.x86.sse41.pminsb";
100 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
101 intrinsic
= "llvm.x86.sse41.pminuw";
102 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
103 intrinsic
= "llvm.x86.sse2.pmins.w";
104 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
105 intrinsic
= "llvm.x86.sse41.pminud";
106 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
107 intrinsic
= "llvm.x86.sse41.pminsd";
112 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
114 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
115 return lp_build_select(bld
, cond
, a
, b
);
121 * No checks for special case values of a or b = 1 or 0 are done.
124 lp_build_max_simple(struct lp_build_context
*bld
,
128 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
129 const struct lp_type type
= bld
->type
;
130 const char *intrinsic
= NULL
;
133 assert(lp_check_value(type
, a
));
134 assert(lp_check_value(type
, b
));
136 /* TODO: optimize the constant case */
138 if(type
.width
* type
.length
== 128) {
140 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
141 intrinsic
= "llvm.x86.sse.max.ps";
142 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
143 intrinsic
= "llvm.x86.sse2.max.pd";
146 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
147 intrinsic
= "llvm.x86.sse2.pmaxu.b";
148 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
149 intrinsic
= "llvm.x86.sse41.pmaxsb";
150 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
151 intrinsic
= "llvm.x86.sse41.pmaxuw";
152 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
153 intrinsic
= "llvm.x86.sse2.pmaxs.w";
154 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
155 intrinsic
= "llvm.x86.sse41.pmaxud";
156 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
157 intrinsic
= "llvm.x86.sse41.pmaxsd";
162 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
164 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
165 return lp_build_select(bld
, cond
, a
, b
);
170 * Generate 1 - a, or ~a depending on bld->type.
173 lp_build_comp(struct lp_build_context
*bld
,
176 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
177 const struct lp_type type
= bld
->type
;
179 assert(lp_check_value(type
, a
));
186 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
187 if(LLVMIsConstant(a
))
188 return LLVMConstNot(a
);
190 return LLVMBuildNot(builder
, a
, "");
193 if(LLVMIsConstant(a
))
195 return LLVMConstFSub(bld
->one
, a
);
197 return LLVMConstSub(bld
->one
, a
);
200 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
202 return LLVMBuildSub(builder
, bld
->one
, a
, "");
210 lp_build_add(struct lp_build_context
*bld
,
214 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
215 const struct lp_type type
= bld
->type
;
218 assert(lp_check_value(type
, a
));
219 assert(lp_check_value(type
, b
));
225 if(a
== bld
->undef
|| b
== bld
->undef
)
229 const char *intrinsic
= NULL
;
231 if(a
== bld
->one
|| b
== bld
->one
)
234 if(util_cpu_caps
.has_sse2
&&
235 type
.width
* type
.length
== 128 &&
236 !type
.floating
&& !type
.fixed
) {
238 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
240 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
244 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
247 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
249 res
= LLVMConstFAdd(a
, b
);
251 res
= LLVMConstAdd(a
, b
);
254 res
= LLVMBuildFAdd(builder
, a
, b
, "");
256 res
= LLVMBuildAdd(builder
, a
, b
, "");
258 /* clamp to ceiling of 1.0 */
259 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
260 res
= lp_build_min_simple(bld
, res
, bld
->one
);
262 /* XXX clamp to floor of -1 or 0??? */
268 /** Return the scalar sum of the elements of a */
270 lp_build_sum_vector(struct lp_build_context
*bld
,
273 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
274 const struct lp_type type
= bld
->type
;
275 LLVMValueRef index
, res
;
278 assert(lp_check_value(type
, a
));
280 if (type
.length
== 1) {
284 assert(!bld
->type
.norm
);
286 index
= lp_build_const_int32(bld
->gallivm
, 0);
287 res
= LLVMBuildExtractElement(builder
, a
, index
, "");
289 for (i
= 1; i
< type
.length
; i
++) {
290 index
= lp_build_const_int32(bld
->gallivm
, i
);
292 res
= LLVMBuildFAdd(builder
, res
,
293 LLVMBuildExtractElement(builder
,
297 res
= LLVMBuildAdd(builder
, res
,
298 LLVMBuildExtractElement(builder
,
311 lp_build_sub(struct lp_build_context
*bld
,
315 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
316 const struct lp_type type
= bld
->type
;
319 assert(lp_check_value(type
, a
));
320 assert(lp_check_value(type
, b
));
324 if(a
== bld
->undef
|| b
== bld
->undef
)
330 const char *intrinsic
= NULL
;
335 if(util_cpu_caps
.has_sse2
&&
336 type
.width
* type
.length
== 128 &&
337 !type
.floating
&& !type
.fixed
) {
339 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
341 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
345 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
348 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
350 res
= LLVMConstFSub(a
, b
);
352 res
= LLVMConstSub(a
, b
);
355 res
= LLVMBuildFSub(builder
, a
, b
, "");
357 res
= LLVMBuildSub(builder
, a
, b
, "");
359 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
360 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
367 * Normalized 8bit multiplication.
371 * makes the following approximation to the division (Sree)
373 * a*b/255 ~= (a*(b + 1)) >> 256
375 * which is the fastest method that satisfies the following OpenGL criteria
377 * 0*0 = 0 and 255*255 = 255
381 * takes the geometric series approximation to the division
383 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
385 * in this case just the first two terms to fit in 16bit arithmetic
387 * t/255 ~= (t + (t >> 8)) >> 8
389 * note that just by itself it doesn't satisfies the OpenGL criteria, as
390 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
393 * - geometric series plus rounding
395 * when using a geometric series division instead of truncating the result
396 * use roundoff in the approximation (Jim Blinn)
398 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
400 * achieving the exact results
402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
403 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
404 * @sa Michael Herf, The "double blend trick", May 2000,
405 * http://www.stereopsis.com/doubleblend.html
408 lp_build_mul_u8n(struct gallivm_state
*gallivm
,
409 struct lp_type i16_type
,
410 LLVMValueRef a
, LLVMValueRef b
)
412 LLVMBuilderRef builder
= gallivm
->builder
;
416 assert(!i16_type
.floating
);
417 assert(lp_check_value(i16_type
, a
));
418 assert(lp_check_value(i16_type
, b
));
420 c8
= lp_build_const_int_vec(gallivm
, i16_type
, 8);
424 /* a*b/255 ~= (a*(b + 1)) >> 256 */
425 b
= LLVMBuildAdd(builder
, b
, lp_build_const_int_vec(gallium
, i16_type
, 1), "");
426 ab
= LLVMBuildMul(builder
, a
, b
, "");
430 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
431 ab
= LLVMBuildMul(builder
, a
, b
, "");
432 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c8
, ""), "");
433 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(gallivm
, i16_type
, 0x80), "");
437 ab
= LLVMBuildLShr(builder
, ab
, c8
, "");
447 lp_build_mul(struct lp_build_context
*bld
,
451 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
452 const struct lp_type type
= bld
->type
;
456 assert(lp_check_value(type
, a
));
457 assert(lp_check_value(type
, b
));
467 if(a
== bld
->undef
|| b
== bld
->undef
)
470 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
471 if(type
.width
== 8) {
472 struct lp_type i16_type
= lp_wider_type(type
);
473 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
475 lp_build_unpack2(bld
->gallivm
, type
, i16_type
, a
, &al
, &ah
);
476 lp_build_unpack2(bld
->gallivm
, type
, i16_type
, b
, &bl
, &bh
);
478 /* PMULLW, PSRLW, PADDW */
479 abl
= lp_build_mul_u8n(bld
->gallivm
, i16_type
, al
, bl
);
480 abh
= lp_build_mul_u8n(bld
->gallivm
, i16_type
, ah
, bh
);
482 ab
= lp_build_pack2(bld
->gallivm
, i16_type
, type
, abl
, abh
);
492 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
496 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
498 res
= LLVMConstFMul(a
, b
);
500 res
= LLVMConstMul(a
, b
);
503 res
= LLVMConstAShr(res
, shift
);
505 res
= LLVMConstLShr(res
, shift
);
510 res
= LLVMBuildFMul(builder
, a
, b
, "");
512 res
= LLVMBuildMul(builder
, a
, b
, "");
515 res
= LLVMBuildAShr(builder
, res
, shift
, "");
517 res
= LLVMBuildLShr(builder
, res
, shift
, "");
526 * Small vector x scale multiplication optimization.
529 lp_build_mul_imm(struct lp_build_context
*bld
,
533 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
536 assert(lp_check_value(bld
->type
, a
));
545 return lp_build_negate(bld
, a
);
547 if(b
== 2 && bld
->type
.floating
)
548 return lp_build_add(bld
, a
, a
);
550 if(util_is_power_of_two(b
)) {
551 unsigned shift
= ffs(b
) - 1;
553 if(bld
->type
.floating
) {
556 * Power of two multiplication by directly manipulating the mantissa.
558 * XXX: This might not be always faster, it will introduce a small error
559 * for multiplication by zero, and it will produce wrong results
562 unsigned mantissa
= lp_mantissa(bld
->type
);
563 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
564 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
565 a
= LLVMBuildAdd(builder
, a
, factor
, "");
566 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
571 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
572 return LLVMBuildShl(builder
, a
, factor
, "");
576 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
577 return lp_build_mul(bld
, a
, factor
);
585 lp_build_div(struct lp_build_context
*bld
,
589 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
590 const struct lp_type type
= bld
->type
;
592 assert(lp_check_value(type
, a
));
593 assert(lp_check_value(type
, b
));
598 return lp_build_rcp(bld
, b
);
603 if(a
== bld
->undef
|| b
== bld
->undef
)
606 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
608 return LLVMConstFDiv(a
, b
);
610 return LLVMConstSDiv(a
, b
);
612 return LLVMConstUDiv(a
, b
);
615 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4 &&
617 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
620 return LLVMBuildFDiv(builder
, a
, b
, "");
622 return LLVMBuildSDiv(builder
, a
, b
, "");
624 return LLVMBuildUDiv(builder
, a
, b
, "");
629 * Linear interpolation -- without any checks.
631 * @sa http://www.stereopsis.com/doubleblend.html
633 static INLINE LLVMValueRef
634 lp_build_lerp_simple(struct lp_build_context
*bld
,
639 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
643 assert(lp_check_value(bld
->type
, x
));
644 assert(lp_check_value(bld
->type
, v0
));
645 assert(lp_check_value(bld
->type
, v1
));
647 delta
= lp_build_sub(bld
, v1
, v0
);
649 res
= lp_build_mul(bld
, x
, delta
);
651 res
= lp_build_add(bld
, v0
, res
);
653 if (bld
->type
.fixed
) {
654 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
655 * but it will be wrong for other uses. Basically we need a more
656 * powerful lp_type, capable of further distinguishing the values
657 * interpretation from the value storage. */
658 res
= LLVMBuildAnd(builder
, res
, lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << bld
->type
.width
/2) - 1), "");
666 * Linear interpolation.
669 lp_build_lerp(struct lp_build_context
*bld
,
674 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
675 const struct lp_type type
= bld
->type
;
678 assert(lp_check_value(type
, x
));
679 assert(lp_check_value(type
, v0
));
680 assert(lp_check_value(type
, v1
));
683 struct lp_type wide_type
;
684 struct lp_build_context wide_bld
;
685 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
688 assert(type
.length
>= 2);
692 * Create a wider type, enough to hold the intermediate result of the
695 memset(&wide_type
, 0, sizeof wide_type
);
696 wide_type
.fixed
= TRUE
;
697 wide_type
.width
= type
.width
*2;
698 wide_type
.length
= type
.length
/2;
700 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
702 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
703 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
704 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
707 * Scale x from [0, 255] to [0, 256]
710 shift
= lp_build_const_int_vec(bld
->gallivm
, wide_type
, type
.width
- 1);
712 xl
= lp_build_add(&wide_bld
, xl
,
713 LLVMBuildAShr(builder
, xl
, shift
, ""));
714 xh
= lp_build_add(&wide_bld
, xh
,
715 LLVMBuildAShr(builder
, xh
, shift
, ""));
721 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
);
722 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
);
724 res
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, resl
, resh
);
726 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
);
734 lp_build_lerp_2d(struct lp_build_context
*bld
,
742 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
743 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
744 return lp_build_lerp(bld
, y
, v0
, v1
);
750 * Do checks for special cases.
753 lp_build_min(struct lp_build_context
*bld
,
757 assert(lp_check_value(bld
->type
, a
));
758 assert(lp_check_value(bld
->type
, b
));
760 if(a
== bld
->undef
|| b
== bld
->undef
)
767 if(a
== bld
->zero
|| b
== bld
->zero
)
775 return lp_build_min_simple(bld
, a
, b
);
781 * Do checks for special cases.
784 lp_build_max(struct lp_build_context
*bld
,
788 assert(lp_check_value(bld
->type
, a
));
789 assert(lp_check_value(bld
->type
, b
));
791 if(a
== bld
->undef
|| b
== bld
->undef
)
798 if(a
== bld
->one
|| b
== bld
->one
)
806 return lp_build_max_simple(bld
, a
, b
);
811 * Generate clamp(a, min, max)
812 * Do checks for special cases.
815 lp_build_clamp(struct lp_build_context
*bld
,
820 assert(lp_check_value(bld
->type
, a
));
821 assert(lp_check_value(bld
->type
, min
));
822 assert(lp_check_value(bld
->type
, max
));
824 a
= lp_build_min(bld
, a
, max
);
825 a
= lp_build_max(bld
, a
, min
);
834 lp_build_abs(struct lp_build_context
*bld
,
837 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
838 const struct lp_type type
= bld
->type
;
839 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
841 assert(lp_check_value(type
, a
));
847 /* Mask out the sign bit */
848 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
849 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
850 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
851 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
852 a
= LLVMBuildAnd(builder
, a
, mask
, "");
853 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
857 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
860 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
862 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
864 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
868 return lp_build_max(bld
, a
, LLVMBuildNeg(builder
, a
, ""));
873 lp_build_negate(struct lp_build_context
*bld
,
876 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
878 assert(lp_check_value(bld
->type
, a
));
880 #if HAVE_LLVM >= 0x0207
881 if (bld
->type
.floating
)
882 a
= LLVMBuildFNeg(builder
, a
, "");
885 a
= LLVMBuildNeg(builder
, a
, "");
891 /** Return -1, 0 or +1 depending on the sign of a */
893 lp_build_sgn(struct lp_build_context
*bld
,
896 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
897 const struct lp_type type
= bld
->type
;
901 assert(lp_check_value(type
, a
));
903 /* Handle non-zero case */
905 /* if not zero then sign must be positive */
908 else if(type
.floating
) {
909 LLVMTypeRef vec_type
;
910 LLVMTypeRef int_type
;
914 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
916 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
917 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
918 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
920 /* Take the sign bit and add it to 1 constant */
921 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
922 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
923 one
= LLVMConstBitCast(bld
->one
, int_type
);
924 res
= LLVMBuildOr(builder
, sign
, one
, "");
925 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
929 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
930 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
931 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
935 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
936 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
943 * Set the sign of float vector 'a' according to 'sign'.
944 * If sign==0, return abs(a).
945 * If sign==1, return -abs(a);
946 * Other values for sign produce undefined results.
949 lp_build_set_sign(struct lp_build_context
*bld
,
950 LLVMValueRef a
, LLVMValueRef sign
)
952 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
953 const struct lp_type type
= bld
->type
;
954 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
955 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
956 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
957 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
958 ~((unsigned long long) 1 << (type
.width
- 1)));
959 LLVMValueRef val
, res
;
961 assert(type
.floating
);
962 assert(lp_check_value(type
, a
));
964 /* val = reinterpret_cast<int>(a) */
965 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
966 /* val = val & mask */
967 val
= LLVMBuildAnd(builder
, val
, mask
, "");
968 /* sign = sign << shift */
969 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
970 /* res = val | sign */
971 res
= LLVMBuildOr(builder
, val
, sign
, "");
972 /* res = reinterpret_cast<float>(res) */
973 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
980 * Convert vector of (or scalar) int to vector of (or scalar) float.
983 lp_build_int_to_float(struct lp_build_context
*bld
,
986 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
987 const struct lp_type type
= bld
->type
;
988 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
990 assert(type
.floating
);
992 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
997 enum lp_build_round_sse41_mode
999 LP_BUILD_ROUND_SSE41_NEAREST
= 0,
1000 LP_BUILD_ROUND_SSE41_FLOOR
= 1,
1001 LP_BUILD_ROUND_SSE41_CEIL
= 2,
1002 LP_BUILD_ROUND_SSE41_TRUNCATE
= 3
1007 * Helper for SSE4.1's ROUNDxx instructions.
1009 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1010 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1012 static INLINE LLVMValueRef
1013 lp_build_round_sse41(struct lp_build_context
*bld
,
1015 enum lp_build_round_sse41_mode mode
)
1017 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1018 const struct lp_type type
= bld
->type
;
1019 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1020 const char *intrinsic
;
1023 assert(type
.floating
);
1025 assert(lp_check_value(type
, a
));
1026 assert(util_cpu_caps
.has_sse4_1
);
1028 if (type
.length
== 1) {
1029 LLVMTypeRef vec_type
;
1031 LLVMValueRef args
[3];
1032 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1034 switch(type
.width
) {
1036 intrinsic
= "llvm.x86.sse41.round.ss";
1039 intrinsic
= "llvm.x86.sse41.round.sd";
1046 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1048 undef
= LLVMGetUndef(vec_type
);
1051 args
[1] = LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1052 args
[2] = LLVMConstInt(i32t
, mode
, 0);
1054 res
= lp_build_intrinsic(builder
, intrinsic
,
1055 vec_type
, args
, Elements(args
));
1057 res
= LLVMBuildExtractElement(builder
, res
, index0
, "");
1060 assert(type
.width
*type
.length
== 128);
1062 switch(type
.width
) {
1064 intrinsic
= "llvm.x86.sse41.round.ps";
1067 intrinsic
= "llvm.x86.sse41.round.pd";
1074 res
= lp_build_intrinsic_binary(builder
, intrinsic
,
1076 LLVMConstInt(i32t
, mode
, 0));
1083 static INLINE LLVMValueRef
1084 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1087 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1088 const struct lp_type type
= bld
->type
;
1089 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1090 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1091 const char *intrinsic
;
1094 assert(type
.floating
);
1095 /* using the double precision conversions is a bit more complicated */
1096 assert(type
.width
== 32);
1098 assert(lp_check_value(type
, a
));
1099 assert(util_cpu_caps
.has_sse2
);
1101 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1102 if (type
.length
== 1) {
1103 LLVMTypeRef vec_type
;
1106 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1108 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1110 intrinsic
= "llvm.x86.sse.cvtss2si";
1112 undef
= LLVMGetUndef(vec_type
);
1114 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1116 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1120 assert(type
.width
*type
.length
== 128);
1122 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1124 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1133 * Return the integer part of a float (vector) value (== round toward zero).
1134 * The returned value is a float (vector).
1135 * Ex: trunc(-1.5) = -1.0
1138 lp_build_trunc(struct lp_build_context
*bld
,
1141 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1142 const struct lp_type type
= bld
->type
;
1144 assert(type
.floating
);
1145 assert(lp_check_value(type
, a
));
1147 if (util_cpu_caps
.has_sse4_1
&&
1148 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1149 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_TRUNCATE
);
1152 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1153 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1155 res
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1156 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1163 * Return float (vector) rounded to nearest integer (vector). The returned
1164 * value is a float (vector).
1165 * Ex: round(0.9) = 1.0
1166 * Ex: round(-1.5) = -2.0
1169 lp_build_round(struct lp_build_context
*bld
,
1172 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1173 const struct lp_type type
= bld
->type
;
1175 assert(type
.floating
);
1176 assert(lp_check_value(type
, a
));
1178 if (util_cpu_caps
.has_sse4_1
&&
1179 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1180 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1183 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1185 res
= lp_build_iround(bld
, a
);
1186 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1193 * Return floor of float (vector), result is a float (vector)
1194 * Ex: floor(1.1) = 1.0
1195 * Ex: floor(-1.1) = -2.0
1198 lp_build_floor(struct lp_build_context
*bld
,
1201 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1202 const struct lp_type type
= bld
->type
;
1204 assert(type
.floating
);
1205 assert(lp_check_value(type
, a
));
1207 if (util_cpu_caps
.has_sse4_1
&&
1208 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1209 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1212 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1214 res
= lp_build_ifloor(bld
, a
);
1215 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1222 * Return ceiling of float (vector), returning float (vector).
1223 * Ex: ceil( 1.1) = 2.0
1224 * Ex: ceil(-1.1) = -1.0
1227 lp_build_ceil(struct lp_build_context
*bld
,
1230 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1231 const struct lp_type type
= bld
->type
;
1233 assert(type
.floating
);
1234 assert(lp_check_value(type
, a
));
1236 if (util_cpu_caps
.has_sse4_1
&&
1237 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1238 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1241 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1243 res
= lp_build_iceil(bld
, a
);
1244 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1251 * Return fractional part of 'a' computed as a - floor(a)
1252 * Typically used in texture coord arithmetic.
1255 lp_build_fract(struct lp_build_context
*bld
,
1258 assert(bld
->type
.floating
);
1259 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
1264 * Return the integer part of a float (vector) value (== round toward zero).
1265 * The returned value is an integer (vector).
1266 * Ex: itrunc(-1.5) = -1
1269 lp_build_itrunc(struct lp_build_context
*bld
,
1272 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1273 const struct lp_type type
= bld
->type
;
1274 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1276 assert(type
.floating
);
1277 assert(lp_check_value(type
, a
));
1279 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1284 * Return float (vector) rounded to nearest integer (vector). The returned
1285 * value is an integer (vector).
1286 * Ex: iround(0.9) = 1
1287 * Ex: iround(-1.5) = -2
1290 lp_build_iround(struct lp_build_context
*bld
,
1293 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1294 const struct lp_type type
= bld
->type
;
1295 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1298 assert(type
.floating
);
1300 assert(lp_check_value(type
, a
));
1302 if (util_cpu_caps
.has_sse2
&&
1303 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) {
1304 return lp_build_iround_nearest_sse2(bld
, a
);
1306 else if (util_cpu_caps
.has_sse4_1
&&
1307 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1308 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1313 half
= lp_build_const_vec(bld
->gallivm
, type
, 0.5);
1316 LLVMTypeRef vec_type
= bld
->vec_type
;
1317 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1318 (unsigned long long)1 << (type
.width
- 1));
1322 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1323 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1326 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
1327 half
= LLVMBuildOr(builder
, sign
, half
, "");
1328 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
1331 res
= LLVMBuildFAdd(builder
, a
, half
, "");
1334 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
1341 * Return floor of float (vector), result is an int (vector)
1342 * Ex: ifloor(1.1) = 1.0
1343 * Ex: ifloor(-1.1) = -2.0
1346 lp_build_ifloor(struct lp_build_context
*bld
,
1349 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1350 const struct lp_type type
= bld
->type
;
1351 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1354 assert(type
.floating
);
1355 assert(lp_check_value(type
, a
));
1357 if (util_cpu_caps
.has_sse4_1
&&
1358 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1359 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1365 /* Take the sign bit and add it to 1 constant */
1366 LLVMTypeRef vec_type
= bld
->vec_type
;
1367 unsigned mantissa
= lp_mantissa(type
);
1368 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1369 (unsigned long long)1 << (type
.width
- 1));
1371 LLVMValueRef offset
;
1373 /* sign = a < 0 ? ~0 : 0 */
1374 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1375 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1376 sign
= LLVMBuildAShr(builder
, sign
,
1377 lp_build_const_int_vec(bld
->gallivm
, type
,
1381 /* offset = -0.99999(9)f */
1382 offset
= lp_build_const_vec(bld
->gallivm
, type
,
1383 -(double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1384 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1386 /* offset = a < 0 ? offset : 0.0f */
1387 offset
= LLVMBuildAnd(builder
, offset
, sign
, "");
1388 offset
= LLVMBuildBitCast(builder
, offset
, vec_type
, "ifloor.offset");
1390 res
= LLVMBuildFAdd(builder
, res
, offset
, "ifloor.res");
1394 /* round to nearest (toward zero) */
1395 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
1402 * Return ceiling of float (vector), returning int (vector).
1403 * Ex: iceil( 1.1) = 2
1404 * Ex: iceil(-1.1) = -1
1407 lp_build_iceil(struct lp_build_context
*bld
,
1410 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1411 const struct lp_type type
= bld
->type
;
1412 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1415 assert(type
.floating
);
1416 assert(lp_check_value(type
, a
));
1418 if (util_cpu_caps
.has_sse4_1
&&
1419 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1420 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1423 LLVMTypeRef vec_type
= bld
->vec_type
;
1424 unsigned mantissa
= lp_mantissa(type
);
1425 LLVMValueRef offset
;
1427 /* offset = 0.99999(9)f */
1428 offset
= lp_build_const_vec(bld
->gallivm
, type
,
1429 (double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1432 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1433 (unsigned long long)1 << (type
.width
- 1));
1436 /* sign = a < 0 ? 0 : ~0 */
1437 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1438 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1439 sign
= LLVMBuildAShr(builder
, sign
,
1440 lp_build_const_int_vec(bld
->gallivm
, type
,
1443 sign
= LLVMBuildNot(builder
, sign
, "iceil.not");
1445 /* offset = a < 0 ? 0.0 : offset */
1446 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1447 offset
= LLVMBuildAnd(builder
, offset
, sign
, "");
1448 offset
= LLVMBuildBitCast(builder
, offset
, vec_type
, "iceil.offset");
1451 res
= LLVMBuildFAdd(builder
, a
, offset
, "iceil.res");
1454 /* round to nearest (toward zero) */
1455 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
1462 * Combined ifloor() & fract().
1464 * Preferred to calling the functions separately, as it will ensure that the
1465 * stratergy (floor() vs ifloor()) that results in less redundant work is used.
1468 lp_build_ifloor_fract(struct lp_build_context
*bld
,
1470 LLVMValueRef
*out_ipart
,
1471 LLVMValueRef
*out_fpart
)
1473 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1474 const struct lp_type type
= bld
->type
;
1477 assert(type
.floating
);
1478 assert(lp_check_value(type
, a
));
1480 if (util_cpu_caps
.has_sse4_1
&&
1481 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1483 * floor() is easier.
1486 ipart
= lp_build_floor(bld
, a
);
1487 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
1488 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
1492 * ifloor() is easier.
1495 *out_ipart
= lp_build_ifloor(bld
, a
);
1496 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
1497 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
1503 lp_build_sqrt(struct lp_build_context
*bld
,
1506 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1507 const struct lp_type type
= bld
->type
;
1508 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1511 assert(lp_check_value(type
, a
));
1513 /* TODO: optimize the constant case */
1514 /* TODO: optimize the constant case */
1516 assert(type
.floating
);
1517 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
1519 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1524 * Do one Newton-Raphson step to improve reciprocate precision:
1526 * x_{i+1} = x_i * (2 - a * x_i)
1528 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1529 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1530 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1531 * halo. It would be necessary to clamp the argument to prevent this.
1534 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1535 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1537 static INLINE LLVMValueRef
1538 lp_build_rcp_refine(struct lp_build_context
*bld
,
1542 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1543 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
1546 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
1547 res
= LLVMBuildFSub(builder
, two
, res
, "");
1548 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
1555 lp_build_rcp(struct lp_build_context
*bld
,
1558 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1559 const struct lp_type type
= bld
->type
;
1561 assert(lp_check_value(type
, a
));
1570 assert(type
.floating
);
1572 if(LLVMIsConstant(a
))
1573 return LLVMConstFDiv(bld
->one
, a
);
1576 * We don't use RCPPS because:
1577 * - it only has 10bits of precision
1578 * - it doesn't even get the reciprocate of 1.0 exactly
1579 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
1580 * - for recent processors the benefit over DIVPS is marginal, a case
1583 * We could still use it on certain processors if benchmarks show that the
1584 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
1585 * particular uses that require less workarounds.
1588 if (FALSE
&& util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1589 const unsigned num_iterations
= 0;
1593 res
= lp_build_intrinsic_unary(builder
, "llvm.x86.sse.rcp.ps", bld
->vec_type
, a
);
1595 for (i
= 0; i
< num_iterations
; ++i
) {
1596 res
= lp_build_rcp_refine(bld
, a
, res
);
1602 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
1607 * Do one Newton-Raphson step to improve rsqrt precision:
1609 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1612 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1614 static INLINE LLVMValueRef
1615 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
1617 LLVMValueRef rsqrt_a
)
1619 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1620 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
1621 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
1624 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
1625 res
= LLVMBuildFMul(builder
, a
, res
, "");
1626 res
= LLVMBuildFSub(builder
, three
, res
, "");
1627 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
1628 res
= LLVMBuildFMul(builder
, half
, res
, "");
1635 * Generate 1/sqrt(a)
1638 lp_build_rsqrt(struct lp_build_context
*bld
,
1641 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1642 const struct lp_type type
= bld
->type
;
1644 assert(lp_check_value(type
, a
));
1646 assert(type
.floating
);
1648 if (util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1649 const unsigned num_iterations
= 1;
1653 res
= lp_build_intrinsic_unary(builder
, "llvm.x86.sse.rsqrt.ps", bld
->vec_type
, a
);
1655 for (i
= 0; i
< num_iterations
; ++i
) {
1656 res
= lp_build_rsqrt_refine(bld
, a
, res
);
1662 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
/**
 * Generate sin(a) using SSE2.
 *
 * Branch-free cephes-style range reduction + two minimax polynomials,
 * ported from Julien Pommier's sse_mathfun.h (the _mm_* lines quoted in
 * the comments are the original SSE intrinsic sequence being mirrored).
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */
   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */
   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */
   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */
   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");

   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    * (bit 2 is unaffected by the &~1 above, so emm2_add is equivalent here)
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * shift it into the float sign-bit position
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */
   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */
   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * Horner evaluation of the cosine polynomial in z:
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */
   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}
/**
 * Generate cos(a) using SSE2.
 *
 * Same cephes-style range reduction and polynomials as lp_build_sin(),
 * ported from Julien Pommier's sse_mathfun.h; cos differs only in the
 * quadrant bias (subtract 2) and sign computation.
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */
   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */
   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */
   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */
   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");

   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /*
    * cos-specific quadrant bias:
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");

   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * shift into the float sign-bit position
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */
   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */
   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * Horner evaluation of the cosine polynomial in z:
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */
   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}
2103 * Generate pow(x, y)
2106 lp_build_pow(struct lp_build_context
*bld
,
2110 /* TODO: optimize the constant case */
2111 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2112 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
2113 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2117 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
2125 lp_build_exp(struct lp_build_context
*bld
,
2128 /* log2(e) = 1/log(2) */
2129 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2130 1.4426950408889634);
2132 assert(lp_check_value(bld
->type
, x
));
2134 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
2142 lp_build_log(struct lp_build_context
*bld
,
2146 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2147 0.69314718055994529);
2149 assert(lp_check_value(bld
->type
, x
));
2151 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   /* Horner evaluation, highest coefficient first. */
   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
         else
            even = coeff;
      }
      else {
         if (odd)
            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
         else
            odd = coeff;
      }
   }

   /* Recombine: even(x^2) + x * odd(x^2). */
   if (odd)
      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
   else if (even)
      return even;
   else
      return bld->undef;
}
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 * Coefficients are ordered from the constant term upward; the degree used
 * is selected at compile time via EXP_POLY_DEGREE.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999925063526176901,
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
/**
 * Approximate exp2(x), optionally also returning the integer and
 * fractional decomposition.
 *
 * Any of the output pointers may be NULL; only the requested values are
 * computed.  Works on 32-bit floats only (asserted).
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* Clamp to the exponent range representable in a 32-bit float. */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart), built by placing ipart+127
       * directly into the float exponent field. */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* expfpart = polynomial approximation of 2**fpart, fpart in [0,1[ */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
2310 lp_build_exp2(struct lp_build_context
*bld
,
2314 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
2320 * Extract the exponent of a IEEE-754 floating point value.
2322 * Optionally apply an integer bias.
2324 * Result is an integer value with
2326 * ifloor(log2(x)) + bias
2329 lp_build_extract_exponent(struct lp_build_context
*bld
,
2333 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2334 const struct lp_type type
= bld
->type
;
2335 unsigned mantissa
= lp_mantissa(type
);
2338 assert(type
.floating
);
2340 assert(lp_check_value(bld
->type
, x
));
2342 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
2344 res
= LLVMBuildLShr(builder
, x
,
2345 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
2346 res
= LLVMBuildAnd(builder
, res
,
2347 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
2348 res
= LLVMBuildSub(builder
, res
,
2349 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
2356 * Extract the mantissa of the a floating.
2358 * Result is a floating point value with
2360 * x / floor(log2(x))
2363 lp_build_extract_mantissa(struct lp_build_context
*bld
,
2366 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2367 const struct lp_type type
= bld
->type
;
2368 unsigned mantissa
= lp_mantissa(type
);
2369 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
2370 (1ULL << mantissa
) - 1);
2371 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
2374 assert(lp_check_value(bld
->type
, x
));
2376 assert(type
.floating
);
2378 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
2380 /* res = x / 2**ipart */
2381 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
2382 res
= LLVMBuildOr(builder
, res
, one
, "");
2383 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 * The degree used is selected at compile time via LOG_POLY_DEGREE.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
/**
 * Approximate log2(x), optionally also returning the raw exponent field
 * and floor(log2(x)).
 *
 * Any of the output pointers may be NULL; only the requested values are
 * computed.  Works on 32-bit floats only (asserted).
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* logexp = unbiased exponent as a float, i.e. floor(log2(x)) */
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x), in [1, 2[ */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
      logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");

      res = LLVMBuildFAdd(builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
2509 lp_build_log2(struct lp_build_context
*bld
,
2513 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);
2519 * Faster (and less accurate) log2.
2521 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2523 * Piece-wise linear approximation, with exact results when x is a
2526 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2529 lp_build_fast_log2(struct lp_build_context
*bld
,
2532 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2536 assert(lp_check_value(bld
->type
, x
));
2538 assert(bld
->type
.floating
);
2540 /* ipart = floor(log2(x)) - 1 */
2541 ipart
= lp_build_extract_exponent(bld
, x
, -1);
2542 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
2544 /* fpart = x / 2**ipart */
2545 fpart
= lp_build_extract_mantissa(bld
, x
);
2548 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
2553 * Fast implementation of iround(log2(x)).
2555 * Not an approximation -- it should give accurate results all the time.
2558 lp_build_ilog2(struct lp_build_context
*bld
,
2561 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2562 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
2565 assert(bld
->type
.floating
);
2567 assert(lp_check_value(bld
->type
, x
));
2569 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
2570 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
2572 /* ipart = floor(log2(x) + 0.5) */
2573 ipart
= lp_build_extract_exponent(bld
, x
, 0);
2579 lp_build_mod(struct lp_build_context
*bld
,
2583 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2585 const struct lp_type type
= bld
->type
;
2587 assert(lp_check_value(type
, x
));
2588 assert(lp_check_value(type
, y
));
2591 res
= LLVMBuildFRem(builder
, x
, y
, "");
2593 res
= LLVMBuildSRem(builder
, x
, y
, "");
2595 res
= LLVMBuildURem(builder
, x
, y
, "");