src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper
  32  *
  33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
  34  * notably min/max and saturated operations), and it is often necessary to
  35  * resort to machine-specific intrinsics directly. The functions here hide all
  36  * these implementation details from the other modules.
  37  *
  38  * We also do simple expression simplification here. Reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include "util/u_memory.h"
  49 #include "util/u_debug.h"
  50 #include "util/u_math.h"
  51 #include "util/u_string.h"
  52 #include "util/u_cpu_detect.h"
  53
  54 #include "lp_bld_type.h"
  55 #include "lp_bld_const.h"
  56 #include "lp_bld_intr.h"
  57 #include "lp_bld_logic.h"
  58 #include "lp_bld_pack.h"
  59 #include "lp_bld_arit.h"
  60
  61
  62 /**
  63  * Generate min(a, b)
  64  * No checks for special case values of a or b = 1 or 0 are done.
  65  */
  66 static LLVMValueRef
  67 lp_build_min_simple(struct lp_build_context *bld,
  68                     LLVMValueRef a,
  69                     LLVMValueRef b)
  70 {
  71    const struct lp_type type = bld->type;
  72    const char *intrinsic = NULL;
  73    LLVMValueRef cond;
  74
  75    /* TODO: optimize the constant case */
  76
  77    if(type.width * type.length == 128) {
  78       if(type.floating) {
  79          if(type.width == 32 && util_cpu_caps.has_sse)
  80             intrinsic = "llvm.x86.sse.min.ps";
  81          if(type.width == 64 && util_cpu_caps.has_sse2)
  82             intrinsic = "llvm.x86.sse2.min.pd";
  83       }
  84       else {
  85          if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
  86             intrinsic = "llvm.x86.sse2.pminu.b";
  87          if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
  88             intrinsic = "llvm.x86.sse41.pminsb";
  89          if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
  90             intrinsic = "llvm.x86.sse41.pminuw";
  91          if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
  92             intrinsic = "llvm.x86.sse2.pmins.w";
  93          if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
  94             intrinsic = "llvm.x86.sse41.pminud";
  95          if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
  96             intrinsic = "llvm.x86.sse41.pminsd";
  97       }
  98    }
  99
 100    if(intrinsic)
 101       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
 102
 103    cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 104    return lp_build_select(bld, cond, a, b);
 105 }
 106
 107
 108 /**
 109  * Generate max(a, b)
 110  * No checks for special case values of a or b = 1 or 0 are done.
 111  */
 112 static LLVMValueRef
 113 lp_build_max_simple(struct lp_build_context *bld,
 114                     LLVMValueRef a,
 115                     LLVMValueRef b)
 116 {
 117    const struct lp_type type = bld->type;
 118    const char *intrinsic = NULL;
 119    LLVMValueRef cond;
 120
 121    /* TODO: optimize the constant case */
 122
 123    if(type.width * type.length == 128) {
 124       if(type.floating) {
 125          if(type.width == 32 && util_cpu_caps.has_sse)
 126             intrinsic = "llvm.x86.sse.max.ps";
 127          if(type.width == 64 && util_cpu_caps.has_sse2)
 128             intrinsic = "llvm.x86.sse2.max.pd";
 129       }
 130       else {
 131          if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
 132             intrinsic = "llvm.x86.sse2.pmaxu.b";
 133          if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
 134             intrinsic = "llvm.x86.sse41.pmaxsb";
 135          if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
 136             intrinsic = "llvm.x86.sse41.pmaxuw";
 137          if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
 138             intrinsic = "llvm.x86.sse2.pmaxs.w";
 139          if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
 140             intrinsic = "llvm.x86.sse41.pmaxud";
 141          if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
 142             intrinsic = "llvm.x86.sse41.pmaxsd";
 143       }
 144    }
 145
 146    if(intrinsic)
 147       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
 148
 149    cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 150    return lp_build_select(bld, cond, a, b);
 151 }
 152
 153
 154 /**
 155  * Generate 1 - a, or ~a depending on bld->type.
 156  */
 157 LLVMValueRef
 158 lp_build_comp(struct lp_build_context *bld,
 159               LLVMValueRef a)
 160 {
 161    const struct lp_type type = bld->type;
 162
 163    if(a == bld->one)
 164       return bld->zero;
 165    if(a == bld->zero)
 166       return bld->one;
 167
 168    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 169       if(LLVMIsConstant(a))
 170          return LLVMConstNot(a);
 171       else
 172          return LLVMBuildNot(bld->builder, a, "");
 173    }
 174
 175    if(LLVMIsConstant(a))
 176       if (type.floating)
 177           return LLVMConstFSub(bld->one, a);
 178       else
 179           return LLVMConstSub(bld->one, a);
 180    else
 181       if (type.floating)
 182          return LLVMBuildFSub(bld->builder, bld->one, a, "");
 183       else
 184          return LLVMBuildSub(bld->builder, bld->one, a, "");
 185 }
 186
 187
 188 /**
 189  * Generate a + b
 190  */
 191 LLVMValueRef
 192 lp_build_add(struct lp_build_context *bld,
 193              LLVMValueRef a,
 194              LLVMValueRef b)
 195 {
 196    const struct lp_type type = bld->type;
 197    LLVMValueRef res;
 198
 199    assert(lp_check_value(type, a));
 200    assert(lp_check_value(type, b));
 201
 202    if(a == bld->zero)
 203       return b;
 204    if(b == bld->zero)
 205       return a;
 206    if(a == bld->undef || b == bld->undef)
 207       return bld->undef;
 208
 209    if(bld->type.norm) {
 210       const char *intrinsic = NULL;
 211
 212       if(a == bld->one || b == bld->one)
 213         return bld->one;
 214
 215       if(util_cpu_caps.has_sse2 &&
 216          type.width * type.length == 128 &&
 217          !type.floating && !type.fixed) {
 218          if(type.width == 8)
 219             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
 220          if(type.width == 16)
 221             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
 222       }
 223
 224       if(intrinsic)
 225          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
 226    }
 227
 228    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 229       if (type.floating)
 230          res = LLVMConstFAdd(a, b);
 231       else
 232          res = LLVMConstAdd(a, b);
 233    else
 234       if (type.floating)
 235          res = LLVMBuildFAdd(bld->builder, a, b, "");
 236       else
 237          res = LLVMBuildAdd(bld->builder, a, b, "");
 238
 239    /* clamp to ceiling of 1.0 */
 240    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 241       res = lp_build_min_simple(bld, res, bld->one);
 242
 243    /* XXX clamp to floor of -1 or 0??? */
 244
 245    return res;
 246 }
 247
 248
 249 /** Return the sum of the elements of a */
 250 LLVMValueRef
 251 lp_build_sum_vector(struct lp_build_context *bld,
 252                     LLVMValueRef a)
 253 {
 254    const struct lp_type type = bld->type;
 255    LLVMValueRef index, res;
 256    unsigned i;
 257
 258    if (a == bld->zero)
 259       return bld->zero;
 260    if (a == bld->undef)
 261       return bld->undef;
 262    assert(type.length > 1);
 263
 264    assert(!bld->type.norm);
 265
 266    index = LLVMConstInt(LLVMInt32Type(), 0, 0);
 267    res = LLVMBuildExtractElement(bld->builder, a, index, "");
 268
 269    for (i = 1; i < type.length; i++) {
 270       index = LLVMConstInt(LLVMInt32Type(), i, 0);
 271       if (type.floating)
 272          res = LLVMBuildFAdd(bld->builder, res,
 273                             LLVMBuildExtractElement(bld->builder,
 274                                                     a, index, ""),
 275                             "");
 276       else
 277          res = LLVMBuildAdd(bld->builder, res,
 278                             LLVMBuildExtractElement(bld->builder,
 279                                                     a, index, ""),
 280                             "");
 281    }
 282
 283    return res;
 284 }
 285
 286
 287 /**
 288  * Generate a - b
 289  */
 290 LLVMValueRef
 291 lp_build_sub(struct lp_build_context *bld,
 292              LLVMValueRef a,
 293              LLVMValueRef b)
 294 {
 295    const struct lp_type type = bld->type;
 296    LLVMValueRef res;
 297
 298    assert(lp_check_value(type, a));
 299    assert(lp_check_value(type, b));
 300
 301    if(b == bld->zero)
 302       return a;
 303    if(a == bld->undef || b == bld->undef)
 304       return bld->undef;
 305    if(a == b)
 306       return bld->zero;
 307
 308    if(bld->type.norm) {
 309       const char *intrinsic = NULL;
 310
 311       if(b == bld->one)
 312         return bld->zero;
 313
 314       if(util_cpu_caps.has_sse2 &&
 315          type.width * type.length == 128 &&
 316          !type.floating && !type.fixed) {
 317          if(type.width == 8)
 318             intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
 319          if(type.width == 16)
 320             intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
 321       }
 322
 323       if(intrinsic)
 324          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
 325    }
 326
 327    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 328       if (type.floating)
 329          res = LLVMConstFSub(a, b);
 330       else
 331          res = LLVMConstSub(a, b);
 332    else
 333       if (type.floating)
 334          res = LLVMBuildFSub(bld->builder, a, b, "");
 335       else
 336          res = LLVMBuildSub(bld->builder, a, b, "");
 337
 338    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 339       res = lp_build_max_simple(bld, res, bld->zero);
 340
 341    return res;
 342 }
 343
 344
 345 /**
 346  * Normalized 8bit multiplication.
 347  *
 348  * - alpha plus one
 349  *
 350  *     makes the following approximation to the division (Sree)
 351  *
 352  *       a*b/255 ~= (a*(b + 1)) >> 8
 353  *
 354  *     which is the fastest method that satisfies the following OpenGL criteria
 355  *
 356  *       0*0 = 0 and 255*255 = 255
 357  *
 358  * - geometric series
 359  *
 360  *     takes the geometric series approximation to the division
 361  *
 362  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 363  *
 364  *     in this case just the first two terms to fit in 16bit arithmetic
 365  *
 366  *       t/255 ~= (t + (t >> 8)) >> 8
 367  *
 368  *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 369  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 370  *     must be used
 371  *
 372  * - geometric series plus rounding
 373  *
 374  *     when using a geometric series division instead of truncating the result
 375  *     use roundoff in the approximation (Jim Blinn)
 376  *
 377  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 378  *
 379  *     achieving the exact results
 380  *
 381  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 382  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 383  * @sa Michael Herf, The "double blend trick", May 2000,
 384  *     http://www.stereopsis.com/doubleblend.html
 385  */
 386 static LLVMValueRef
 387 lp_build_mul_u8n(LLVMBuilderRef builder,
 388                  struct lp_type i16_type,
 389                  LLVMValueRef a, LLVMValueRef b)
 390 {
 391    LLVMValueRef c8;
 392    LLVMValueRef ab;
 393
 394    c8 = lp_build_const_int_vec(i16_type, 8);
 395
 396 #if 0
 397
 398    /* a*b/255 ~= (a*(b + 1)) >> 256 */
 399    b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
 400    ab = LLVMBuildMul(builder, a, b, "");
 401
 402 #else
 403
 404    /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
 405    ab = LLVMBuildMul(builder, a, b, "");
 406    ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
 407    ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
 408
 409 #endif
 410
 411    ab = LLVMBuildLShr(builder, ab, c8, "");
 412
 413    return ab;
 414 }
 415
 416
 417 /**
 418  * Generate a * b
 419  */
 420 LLVMValueRef
 421 lp_build_mul(struct lp_build_context *bld,
 422              LLVMValueRef a,
 423              LLVMValueRef b)
 424 {
 425    const struct lp_type type = bld->type;
 426    LLVMValueRef shift;
 427    LLVMValueRef res;
 428
 429    assert(lp_check_value(type, a));
 430    assert(lp_check_value(type, b));
 431
 432    if(a == bld->zero)
 433       return bld->zero;
 434    if(a == bld->one)
 435       return b;
 436    if(b == bld->zero)
 437       return bld->zero;
 438    if(b == bld->one)
 439       return a;
 440    if(a == bld->undef || b == bld->undef)
 441       return bld->undef;
 442
 443    if(!type.floating && !type.fixed && type.norm) {
 444       if(type.width == 8) {
 445          struct lp_type i16_type = lp_wider_type(type);
 446          LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 447
 448          lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
 449          lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
 450
 451          /* PMULLW, PSRLW, PADDW */
 452          abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
 453          abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
 454
 455          ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
 456
 457          return ab;
 458       }
 459
 460       /* FIXME */
 461       assert(0);
 462    }
 463
 464    if(type.fixed)
 465       shift = lp_build_const_int_vec(type, type.width/2);
 466    else
 467       shift = NULL;
 468
 469    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 470       if (type.floating)
 471          res = LLVMConstFMul(a, b);
 472       else
 473          res = LLVMConstMul(a, b);
 474       if(shift) {
 475          if(type.sign)
 476             res = LLVMConstAShr(res, shift);
 477          else
 478             res = LLVMConstLShr(res, shift);
 479       }
 480    }
 481    else {
 482       if (type.floating)
 483          res = LLVMBuildFMul(bld->builder, a, b, "");
 484       else
 485          res = LLVMBuildMul(bld->builder, a, b, "");
 486       if(shift) {
 487          if(type.sign)
 488             res = LLVMBuildAShr(bld->builder, res, shift, "");
 489          else
 490             res = LLVMBuildLShr(bld->builder, res, shift, "");
 491       }
 492    }
 493
 494    return res;
 495 }
 496
 497
 498 /**
 499  * Small vector x scale multiplication optimization.
 500  */
 501 LLVMValueRef
 502 lp_build_mul_imm(struct lp_build_context *bld,
 503                  LLVMValueRef a,
 504                  int b)
 505 {
 506    LLVMValueRef factor;
 507
 508    if(b == 0)
 509       return bld->zero;
 510
 511    if(b == 1)
 512       return a;
 513
 514    if(b == -1)
 515       return lp_build_negate(bld, a);
 516
 517    if(b == 2 && bld->type.floating)
 518       return lp_build_add(bld, a, a);
 519
 520    if(util_is_pot(b)) {
 521       unsigned shift = ffs(b) - 1;
 522
 523       if(bld->type.floating) {
 524 #if 0
 525          /*
 526           * Power of two multiplication by directly manipulating the mantissa.
 527           *
 528           * XXX: This might not be always faster, it will introduce a small error
 529           * for multiplication by zero, and it will produce wrong results
 530           * for Inf and NaN.
 531           */
 532          unsigned mantissa = lp_mantissa(bld->type);
 533          factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
 534          a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
 535          a = LLVMBuildAdd(bld->builder, a, factor, "");
 536          a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
 537          return a;
 538 #endif
 539       }
 540       else {
 541          factor = lp_build_const_vec(bld->type, shift);
 542          return LLVMBuildShl(bld->builder, a, factor, "");
 543       }
 544    }
 545
 546    factor = lp_build_const_vec(bld->type, (double)b);
 547    return lp_build_mul(bld, a, factor);
 548 }
 549
 550
 551 /**
 552  * Generate a / b
 553  */
 554 LLVMValueRef
 555 lp_build_div(struct lp_build_context *bld,
 556              LLVMValueRef a,
 557              LLVMValueRef b)
 558 {
 559    const struct lp_type type = bld->type;
 560
 561    assert(lp_check_value(type, a));
 562    assert(lp_check_value(type, b));
 563
 564    if(a == bld->zero)
 565       return bld->zero;
 566    if(a == bld->one)
 567       return lp_build_rcp(bld, b);
 568    if(b == bld->zero)
 569       return bld->undef;
 570    if(b == bld->one)
 571       return a;
 572    if(a == bld->undef || b == bld->undef)
 573       return bld->undef;
 574
 575    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 576       return LLVMConstFDiv(a, b);
 577
 578    if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
 579       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
 580
 581    return LLVMBuildFDiv(bld->builder, a, b, "");
 582 }
 583
 584
 585 /**
 586  * Linear interpolation.
 587  *
 588  * This also works for integer values with a few caveats.
 589  *
 590  * @sa http://www.stereopsis.com/doubleblend.html
 591  */
 592 LLVMValueRef
 593 lp_build_lerp(struct lp_build_context *bld,
 594               LLVMValueRef x,
 595               LLVMValueRef v0,
 596               LLVMValueRef v1)
 597 {
 598    LLVMValueRef delta;
 599    LLVMValueRef res;
 600
 601    delta = lp_build_sub(bld, v1, v0);
 602
 603    res = lp_build_mul(bld, x, delta);
 604
 605    res = lp_build_add(bld, v0, res);
 606
 607    if(bld->type.fixed)
 608       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
 609        * but it will be wrong for other uses. Basically we need a more
 610        * powerful lp_type, capable of further distinguishing the values
 611        * interpretation from the value storage. */
 612       res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
 613
 614    return res;
 615 }
 616
 617
 618 LLVMValueRef
 619 lp_build_lerp_2d(struct lp_build_context *bld,
 620                  LLVMValueRef x,
 621                  LLVMValueRef y,
 622                  LLVMValueRef v00,
 623                  LLVMValueRef v01,
 624                  LLVMValueRef v10,
 625                  LLVMValueRef v11)
 626 {
 627    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
 628    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
 629    return lp_build_lerp(bld, y, v0, v1);
 630 }
 631
 632
 633 /**
 634  * Generate min(a, b)
 635  * Do checks for special cases.
 636  */
 637 LLVMValueRef
 638 lp_build_min(struct lp_build_context *bld,
 639              LLVMValueRef a,
 640              LLVMValueRef b)
 641 {
 642    if(a == bld->undef || b == bld->undef)
 643       return bld->undef;
 644
 645    if(a == b)
 646       return a;
 647
 648    if(bld->type.norm) {
 649       if(a == bld->zero || b == bld->zero)
 650          return bld->zero;
 651       if(a == bld->one)
 652          return b;
 653       if(b == bld->one)
 654          return a;
 655    }
 656
 657    return lp_build_min_simple(bld, a, b);
 658 }
 659
 660
 661 /**
 662  * Generate max(a, b)
 663  * Do checks for special cases.
 664  */
 665 LLVMValueRef
 666 lp_build_max(struct lp_build_context *bld,
 667              LLVMValueRef a,
 668              LLVMValueRef b)
 669 {
 670    if(a == bld->undef || b == bld->undef)
 671       return bld->undef;
 672
 673    if(a == b)
 674       return a;
 675
 676    if(bld->type.norm) {
 677       if(a == bld->one || b == bld->one)
 678          return bld->one;
 679       if(a == bld->zero)
 680          return b;
 681       if(b == bld->zero)
 682          return a;
 683    }
 684
 685    return lp_build_max_simple(bld, a, b);
 686 }
 687
 688
 689 /**
 690  * Generate clamp(a, min, max)
 691  * Do checks for special cases.
 692  */
 693 LLVMValueRef
 694 lp_build_clamp(struct lp_build_context *bld,
 695                LLVMValueRef a,
 696                LLVMValueRef min,
 697                LLVMValueRef max)
 698 {
 699    a = lp_build_min(bld, a, max);
 700    a = lp_build_max(bld, a, min);
 701    return a;
 702 }
 703
 704
 705 /**
 706  * Generate abs(a)
 707  */
 708 LLVMValueRef
 709 lp_build_abs(struct lp_build_context *bld,
 710              LLVMValueRef a)
 711 {
 712    const struct lp_type type = bld->type;
 713    LLVMTypeRef vec_type = lp_build_vec_type(type);
 714
 715    if(!type.sign)
 716       return a;
 717
 718    if(type.floating) {
 719       /* Mask out the sign bit */
 720       LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 721       unsigned long long absMask = ~(1ULL << (type.width - 1));
 722       LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
 723       a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
 724       a = LLVMBuildAnd(bld->builder, a, mask, "");
 725       a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
 726       return a;
 727    }
 728
 729    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
 730       switch(type.width) {
 731       case 8:
 732          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
 733       case 16:
 734          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
 735       case 32:
 736          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
 737       }
 738    }
 739
 740    return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
 741 }
 742
 743
 744 LLVMValueRef
 745 lp_build_negate(struct lp_build_context *bld,
 746                 LLVMValueRef a)
 747 {
 748 #if HAVE_LLVM >= 0x0207
 749    if (bld->type.floating)
 750       a = LLVMBuildFNeg(bld->builder, a, "");
 751    else
 752 #endif
 753       a = LLVMBuildNeg(bld->builder, a, "");
 754
 755    return a;
 756 }
 757
 758
 759 /** Return -1, 0 or +1 depending on the sign of a */
 760 LLVMValueRef
 761 lp_build_sgn(struct lp_build_context *bld,
 762              LLVMValueRef a)
 763 {
 764    const struct lp_type type = bld->type;
 765    LLVMValueRef cond;
 766    LLVMValueRef res;
 767
 768    /* Handle non-zero case */
 769    if(!type.sign) {
 770       /* if not zero then sign must be positive */
 771       res = bld->one;
 772    }
 773    else if(type.floating) {
 774       LLVMTypeRef vec_type;
 775       LLVMTypeRef int_type;
 776       LLVMValueRef mask;
 777       LLVMValueRef sign;
 778       LLVMValueRef one;
 779       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
 780
 781       int_type = lp_build_int_vec_type(type);
 782       vec_type = lp_build_vec_type(type);
 783       mask = lp_build_const_int_vec(type, maskBit);
 784
 785       /* Take the sign bit and add it to 1 constant */
 786       sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
 787       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
 788       one = LLVMConstBitCast(bld->one, int_type);
 789       res = LLVMBuildOr(bld->builder, sign, one, "");
 790       res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
 791    }
 792    else
 793    {
 794       LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
 795       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
 796       res = lp_build_select(bld, cond, bld->one, minus_one);
 797    }
 798
 799    /* Handle zero */
 800    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
 801    res = lp_build_select(bld, cond, bld->zero, res);
 802
 803    return res;
 804 }
 805
 806
 807 /**
 808  * Set the sign of float vector 'a' according to 'sign'.
 809  * If sign==0, return abs(a).
 810  * If sign==1, return -abs(a);
 811  * Other values for sign produce undefined results.
 812  */
 813 LLVMValueRef
 814 lp_build_set_sign(struct lp_build_context *bld,
 815                   LLVMValueRef a, LLVMValueRef sign)
 816 {
 817    const struct lp_type type = bld->type;
 818    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 819    LLVMTypeRef vec_type = lp_build_vec_type(type);
 820    LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
 821    LLVMValueRef mask = lp_build_const_int_vec(type,
 822                              ~((unsigned long long) 1 << (type.width - 1)));
 823    LLVMValueRef val, res;
 824
 825    assert(type.floating);
 826
 827    /* val = reinterpret_cast<int>(a) */
 828    val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
 829    /* val = val & mask */
 830    val = LLVMBuildAnd(bld->builder, val, mask, "");
 831    /* sign = sign << shift */
 832    sign = LLVMBuildShl(bld->builder, sign, shift, "");
 833    /* res = val | sign */
 834    res = LLVMBuildOr(bld->builder, val, sign, "");
 835    /* res = reinterpret_cast<float>(res) */
 836    res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
 837
 838    return res;
 839 }
 840
 841
 842 /**
 843  * Convert vector of (or scalar) int to vector of (or scalar) float.
 844  */
 845 LLVMValueRef
 846 lp_build_int_to_float(struct lp_build_context *bld,
 847                       LLVMValueRef a)
 848 {
 849    const struct lp_type type = bld->type;
 850    LLVMTypeRef vec_type = lp_build_vec_type(type);
 851
 852    assert(type.floating);
 853
 854    return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
 855 }
 856
 857
 858
 859 enum lp_build_round_sse41_mode
 860 {
 861    LP_BUILD_ROUND_SSE41_NEAREST = 0,
 862    LP_BUILD_ROUND_SSE41_FLOOR = 1,
 863    LP_BUILD_ROUND_SSE41_CEIL = 2,
 864    LP_BUILD_ROUND_SSE41_TRUNCATE = 3
 865 };
 866
 867
 868 static INLINE LLVMValueRef
 869 lp_build_round_sse41(struct lp_build_context *bld,
 870                      LLVMValueRef a,
 871                      enum lp_build_round_sse41_mode mode)
 872 {
 873    const struct lp_type type = bld->type;
 874    LLVMTypeRef vec_type = lp_build_vec_type(type);
 875    const char *intrinsic;
 876
 877    assert(type.floating);
 878    assert(type.width*type.length == 128);
 879    assert(lp_check_value(type, a));
 880    assert(util_cpu_caps.has_sse4_1);
 881
 882    switch(type.width) {
 883    case 32:
 884       intrinsic = "llvm.x86.sse41.round.ps";
 885       break;
 886    case 64:
 887       intrinsic = "llvm.x86.sse41.round.pd";
 888       break;
 889    default:
 890       assert(0);
 891       return bld->undef;
 892    }
 893
 894    return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
 895                                     LLVMConstInt(LLVMInt32Type(), mode, 0));
 896 }
 897
 898
 899 /**
 900  * Return the integer part of a float (vector) value.  The returned value is
 901  * a float (vector).
 902  * Ex: trunc(-1.5) = 1.0
 903  */
 904 LLVMValueRef
 905 lp_build_trunc(struct lp_build_context *bld,
 906                LLVMValueRef a)
 907 {
 908    const struct lp_type type = bld->type;
 909
 910    assert(type.floating);
 911    assert(lp_check_value(type, a));
 912
 913    if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
 914       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
 915    else {
 916       LLVMTypeRef vec_type = lp_build_vec_type(type);
 917       LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 918       LLVMValueRef res;
 919       res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
 920       res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
 921       return res;
 922    }
 923 }
 924
 925
 926 /**
 927  * Return float (vector) rounded to nearest integer (vector).  The returned
 928  * value is a float (vector).
 929  * Ex: round(0.9) = 1.0
 930  * Ex: round(-1.5) = -2.0
 931  */
 932 LLVMValueRef
 933 lp_build_round(struct lp_build_context *bld,
 934                LLVMValueRef a)
 935 {
 936    const struct lp_type type = bld->type;
 937
 938    assert(type.floating);
 939    assert(lp_check_value(type, a));
 940
 941    if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
 942       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
 943    else {
 944       LLVMTypeRef vec_type = lp_build_vec_type(type);
 945       LLVMValueRef res;
 946       res = lp_build_iround(bld, a);
 947       res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
 948       return res;
 949    }
 950 }
 951
 952
 953 /**
 954  * Return floor of float (vector), result is a float (vector)
 955  * Ex: floor(1.1) = 1.0
 956  * Ex: floor(-1.1) = -2.0
 957  */
 958 LLVMValueRef
 959 lp_build_floor(struct lp_build_context *bld,
 960                LLVMValueRef a)
 961 {
 962    const struct lp_type type = bld->type;
 963
 964    assert(type.floating);
 965    assert(lp_check_value(type, a));
 966
 967    if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
 968       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
 969    else {
 970       LLVMTypeRef vec_type = lp_build_vec_type(type);
 971       LLVMValueRef res;
 972       res = lp_build_ifloor(bld, a);
 973       res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
 974       return res;
 975    }
 976 }
 977
 978
 979 /**
 980  * Return ceiling of float (vector), returning float (vector).
 981  * Ex: ceil( 1.1) = 2.0
 982  * Ex: ceil(-1.1) = -1.0
 983  */
 984 LLVMValueRef
 985 lp_build_ceil(struct lp_build_context *bld,
 986               LLVMValueRef a)
 987 {
 988    const struct lp_type type = bld->type;
 989
 990    assert(type.floating);
 991    assert(lp_check_value(type, a));
 992
 993    if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
 994       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
 995    else {
 996       LLVMTypeRef vec_type = lp_build_vec_type(type);
 997       LLVMValueRef res;
 998       res = lp_build_iceil(bld, a);
 999       res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1000       return res;
1001    }
1002 }
1003
1004
1005 /**
1006  * Return fractional part of 'a' computed as a - floor(a)
1007  * Typically used in texture coord arithmetic.
1008  */
1009 LLVMValueRef
1010 lp_build_fract(struct lp_build_context *bld,
1011                LLVMValueRef a)
1012 {
1013    assert(bld->type.floating);
1014    return lp_build_sub(bld, a, lp_build_floor(bld, a));
1015 }
1016
1017
1018 /**
1019  * Return the integer part of a float (vector) value.  The returned value is
1020  * an integer (vector).
1021  * Ex: itrunc(-1.5) = 1
1022  */
1023 LLVMValueRef
1024 lp_build_itrunc(struct lp_build_context *bld,
1025                 LLVMValueRef a)
1026 {
1027    const struct lp_type type = bld->type;
1028    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1029
1030    assert(type.floating);
1031    assert(lp_check_value(type, a));
1032
1033    return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1034 }
1035
1036
1037 /**
1038  * Return float (vector) rounded to nearest integer (vector).  The returned
1039  * value is an integer (vector).
1040  * Ex: iround(0.9) = 1
1041  * Ex: iround(-1.5) = -2
1042  */
1043 LLVMValueRef
1044 lp_build_iround(struct lp_build_context *bld,
1045                 LLVMValueRef a)
1046 {
1047    const struct lp_type type = bld->type;
1048    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1049    LLVMValueRef res;
1050
1051    assert(type.floating);
1052
1053    assert(lp_check_value(type, a));
1054
1055    if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1056       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1057    }
1058    else {
1059       LLVMTypeRef vec_type = lp_build_vec_type(type);
1060       LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1061       LLVMValueRef sign;
1062       LLVMValueRef half;
1063
1064       /* get sign bit */
1065       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1066       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1067
1068       /* sign * 0.5 */
1069       half = lp_build_const_vec(type, 0.5);
1070       half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1071       half = LLVMBuildOr(bld->builder, sign, half, "");
1072       half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1073
1074       res = LLVMBuildFAdd(bld->builder, a, half, "");
1075    }
1076
1077    res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1078
1079    return res;
1080 }
1081
1082
1083 /**
1084  * Return floor of float (vector), result is an int (vector)
1085  * Ex: ifloor(1.1) = 1.0
1086  * Ex: ifloor(-1.1) = -2.0
1087  */
1088 LLVMValueRef
1089 lp_build_ifloor(struct lp_build_context *bld,
1090                 LLVMValueRef a)
1091 {
1092    const struct lp_type type = bld->type;
1093    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1094    LLVMValueRef res;
1095
1096    assert(type.floating);
1097    assert(lp_check_value(type, a));
1098
1099    if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1100       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1101    }
1102    else {
1103       /* Take the sign bit and add it to 1 constant */
1104       LLVMTypeRef vec_type = lp_build_vec_type(type);
1105       unsigned mantissa = lp_mantissa(type);
1106       LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1107       LLVMValueRef sign;
1108       LLVMValueRef offset;
1109
1110       /* sign = a < 0 ? ~0 : 0 */
1111       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1112       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1113       sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1114
1115       /* offset = -0.99999(9)f */
1116       offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1117       offset = LLVMConstBitCast(offset, int_vec_type);
1118
1119       /* offset = a < 0 ? offset : 0.0f */
1120       offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1121       offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1122
1123       res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1124    }
1125
1126    /* round to nearest (toward zero) */
1127    res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1128
1129    return res;
1130 }
1131
1132
1133 /**
1134  * Return ceiling of float (vector), returning int (vector).
1135  * Ex: iceil( 1.1) = 2
1136  * Ex: iceil(-1.1) = -1
1137  */
1138 LLVMValueRef
1139 lp_build_iceil(struct lp_build_context *bld,
1140                LLVMValueRef a)
1141 {
1142    const struct lp_type type = bld->type;
1143    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1144    LLVMValueRef res;
1145
1146    assert(type.floating);
1147    assert(lp_check_value(type, a));
1148
1149    if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1150       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1151    }
1152    else {
1153       LLVMTypeRef vec_type = lp_build_vec_type(type);
1154       unsigned mantissa = lp_mantissa(type);
1155       LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1156       LLVMValueRef sign;
1157       LLVMValueRef offset;
1158
1159       /* sign = a < 0 ? 0 : ~0 */
1160       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1161       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1162       sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1163       sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1164
1165       /* offset = 0.99999(9)f */
1166       offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1167       offset = LLVMConstBitCast(offset, int_vec_type);
1168
1169       /* offset = a < 0 ? 0.0 : offset */
1170       offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1171       offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1172
1173       res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1174    }
1175
1176    /* round to nearest (toward zero) */
1177    res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1178
1179    return res;
1180 }
1181
1182
1183 LLVMValueRef
1184 lp_build_sqrt(struct lp_build_context *bld,
1185               LLVMValueRef a)
1186 {
1187    const struct lp_type type = bld->type;
1188    LLVMTypeRef vec_type = lp_build_vec_type(type);
1189    char intrinsic[32];
1190
1191    /* TODO: optimize the constant case */
1192    /* TODO: optimize the constant case */
1193
1194    assert(type.floating);
1195    util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1196
1197    return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1198 }
1199
1200
1201 LLVMValueRef
1202 lp_build_rcp(struct lp_build_context *bld,
1203              LLVMValueRef a)
1204 {
1205    const struct lp_type type = bld->type;
1206
1207    if(a == bld->zero)
1208       return bld->undef;
1209    if(a == bld->one)
1210       return bld->one;
1211    if(a == bld->undef)
1212       return bld->undef;
1213
1214    assert(type.floating);
1215
1216    if(LLVMIsConstant(a))
1217       return LLVMConstFDiv(bld->one, a);
1218
1219    if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1220       /*
1221        * XXX: Added precision is not always necessary, so only enable this
1222        * when we have a better system in place to track minimum precision.
1223        */
1224
1225 #if 0
1226       /*
1227        * Do one Newton-Raphson step to improve precision:
1228        *
1229        *   x1 = (2 - a * rcp(a)) * rcp(a)
1230        */
1231
1232       LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1233       LLVMValueRef rcp_a;
1234       LLVMValueRef res;
1235
1236       rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1237
1238       res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1239       res = LLVMBuildFSub(bld->builder, two, res, "");
1240       res = LLVMBuildFMul(bld->builder, res, rcp_a, "");
1241
1242       return rcp_a;
1243 #else
1244       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1245 #endif
1246    }
1247
1248    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1249 }
1250
1251
1252 /**
1253  * Generate 1/sqrt(a)
1254  */
1255 LLVMValueRef
1256 lp_build_rsqrt(struct lp_build_context *bld,
1257                LLVMValueRef a)
1258 {
1259    const struct lp_type type = bld->type;
1260
1261    assert(type.floating);
1262
1263    if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1264       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1265
1266    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1267 }
1268
1269
1270 static inline LLVMValueRef
1271 lp_build_const_v4si(unsigned long value)
1272 {
1273    LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1274    LLVMValueRef elements[4] = { element, element, element, element };
1275    return LLVMConstVector(elements, 4);
1276 }
1277
1278 static inline LLVMValueRef
1279 lp_build_const_v4sf(float value)
1280 {
1281    LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1282    LLVMValueRef elements[4] = { element, element, element, element };
1283    return LLVMConstVector(elements, 4);
1284 }
1285
1286
/**
 * Generate sin(a) using SSE2
 *
 * The input is bitcast to, and processed as, a vector of four 32-bit
 * floats regardless of bld->type; only bld->builder and the integer
 * variant of bld->type (for the comparison) are taken from the context.
 * The `_mm_*` lines quoted in the comments show the SSE intrinsic each
 * step corresponds to; the code itself emits plain LLVM instructions.
 *
 * Outline: strip the sign, scale by 4/Pi to find the octant, subtract the
 * matching multiple of Pi/4 in three steps, evaluate both a "coscof" and a
 * "sincof" polynomial, select one per element, and finally apply the sign.
 *
 * @param bld  build context
 * @param a    input value, NOTE(review): presumably must be a 4 x float32
 *             vector since it is bitcast to <4 x i32> - confirm with callers
 * @return     the result, as a vector of four 32-bit floats
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = bld->builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);

   /*
    *  take the absolute value,
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_v4si(1);
   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(~1);
   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    *
    * (emm2_add is used here rather than emm2_and; they only differ in
    * bit 0, which the mask with 4 ignores)
    */
   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * move the flag (bit 2) up to the float sign position (bit 31)
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(0));
   /*
    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * (the DP constants above are already negated, hence the adds below)
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (the one built from the coscof
    * coefficients)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);

   /*
    * y = ((coscof_p0 * z + coscof_p1) * z + coscof_p2) * z * z
    *
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * y = y - z/2 + 1
    *
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);

   /*
    * Evaluate the second polynomial (the one built from the sincof
    * coefficients)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    * (y2 where poly_mask is set, y elsewhere; the two masked values are
    * combined with an integer add)
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_v4si(~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
   return y_result;
}
1503
1504
/**
 * Generate cos(a) using SSE2
 *
 * The input is bitcast to, and processed as, a vector of four 32-bit
 * floats regardless of bld->type; only bld->builder and the integer
 * variant of bld->type (for the comparison) are taken from the context.
 * The `_mm_*` lines quoted in the comments show the SSE intrinsic each
 * step corresponds to; the code itself emits plain LLVM instructions.
 *
 * Outline: strip the sign, scale by 4/Pi to find the octant, subtract the
 * matching multiple of Pi/4 in three steps, evaluate both a "coscof" and a
 * "sincof" polynomial, select one per element, and finally apply the sign
 * derived from the octant.  Unlike lp_build_sin, the sign of the input is
 * not folded into the result.
 *
 * @param bld  build context
 * @param a    input value, NOTE(review): presumably must be a 4 x float32
 *             vector since it is bitcast to <4 x i32> - confirm with callers
 * @return     the result, as a vector of four 32-bit floats
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = bld->builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);

   /*
    *  take the absolute value,
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_v4si(1);
   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(~1);
   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");


   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");


   /* get the swap sign flag
    * (the and-not is expressed as an xor with all ones followed by an and)
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_v4si(~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * move the flag (bit 2) up to the float sign position (bit 31)
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * (the DP constants above are already negated, hence the adds below)
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (the one built from the coscof
    * coefficients)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);

   /*
    * y = ((coscof_p0 * z + coscof_p1) * z + coscof_p2) * z * z
    *
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * y = y - z/2 + 1
    *
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);

   /*
    * Evaluate the second polynomial (the one built from the sincof
    * coefficients)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    * (y2 where poly_mask is set, y elsewhere; the two masked values are
    * combined with an integer add)
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
   return y_result;
}
1719
1720
1721 /**
1722  * Generate pow(x, y)
1723  */
1724 LLVMValueRef
1725 lp_build_pow(struct lp_build_context *bld,
1726              LLVMValueRef x,
1727              LLVMValueRef y)
1728 {
1729    /* TODO: optimize the constant case */
1730    if(LLVMIsConstant(x) && LLVMIsConstant(y))
1731       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1732                    __FUNCTION__);
1733
1734    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1735 }
1736
1737
1738 /**
1739  * Generate exp(x)
1740  */
1741 LLVMValueRef
1742 lp_build_exp(struct lp_build_context *bld,
1743              LLVMValueRef x)
1744 {
1745    /* log2(e) = 1/log(2) */
1746    LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1747
1748    return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1749 }
1750
1751
1752 /**
1753  * Generate log(x)
1754  */
1755 LLVMValueRef
1756 lp_build_log(struct lp_build_context *bld,
1757              LLVMValueRef x)
1758 {
1759    /* log(2) */
1760    LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1761
1762    return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1763 }
1764
1765
/*
 * Compile-time selection of the coefficient sets used by the
 * lp_build_exp2_polynomial and lp_build_log2_polynomial tables (see the
 * #if chains inside those tables for the supported values).
 */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
1768
1769
1770 /**
1771  * Generate polynomial.
1772  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1773  */
1774 static LLVMValueRef
1775 lp_build_polynomial(struct lp_build_context *bld,
1776                     LLVMValueRef x,
1777                     const double *coeffs,
1778                     unsigned num_coeffs)
1779 {
1780    const struct lp_type type = bld->type;
1781    LLVMValueRef res = NULL;
1782    unsigned i;
1783
1784    /* TODO: optimize the constant case */
1785    if(LLVMIsConstant(x))
1786       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1787                    __FUNCTION__);
1788
1789    for (i = num_coeffs; i--; ) {
1790       LLVMValueRef coeff;
1791
1792       coeff = lp_build_const_vec(type, coeffs[i]);
1793
1794       if(res)
1795          res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1796       else
1797          res = coeff;
1798    }
1799
1800    if(res)
1801       return res;
1802    else
1803       return bld->undef;
1804 }
1805
1806
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * Coefficients are listed from the constant term upwards, which is the
 * order lp_build_polynomial expects.  EXP_POLY_DEGREE picks the set; any
 * value without a matching branch is a compile error.
 *
 * NOTE(review): the degree 5 and degree 4 sets evaluate to roughly 1.78
 * and 1.74 at x = 1 (the degree 3 and 2 sets give roughly 2.0), so they
 * may have been fitted for something other than 2**x on [0, 1[ - verify
 * before selecting them.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999999690134838155,
   0.583974334321735217258,
   0.164553105719676828492,
   0.0292811063701710962255,
   0.00354944426657875141846,
   0.000296253726543423377365
#elif EXP_POLY_DEGREE == 4
   1.00000001502262084505,
   0.563586057338685991394,
   0.150436017652442413623,
   0.0243220604213317927308,
   0.0025359088446580436489
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
1837
1838
1839 void
1840 lp_build_exp2_approx(struct lp_build_context *bld,
1841                      LLVMValueRef x,
1842                      LLVMValueRef *p_exp2_int_part,
1843                      LLVMValueRef *p_frac_part,
1844                      LLVMValueRef *p_exp2)
1845 {
1846    const struct lp_type type = bld->type;
1847    LLVMTypeRef vec_type = lp_build_vec_type(type);
1848    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1849    LLVMValueRef ipart = NULL;
1850    LLVMValueRef fpart = NULL;
1851    LLVMValueRef expipart = NULL;
1852    LLVMValueRef expfpart = NULL;
1853    LLVMValueRef res = NULL;
1854
1855    if(p_exp2_int_part || p_frac_part || p_exp2) {
1856       /* TODO: optimize the constant case */
1857       if(LLVMIsConstant(x))
1858          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1859                       __FUNCTION__);
1860
1861       assert(type.floating && type.width == 32);
1862
1863       x = lp_build_min(bld, x, lp_build_const_vec(type,  129.0));
1864       x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1865
1866       /* ipart = floor(x) */
1867       ipart = lp_build_floor(bld, x);
1868
1869       /* fpart = x - ipart */
1870       fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
1871    }
1872
1873    if(p_exp2_int_part || p_exp2) {
1874       /* expipart = (float) (1 << ipart) */
1875       ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1876       expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1877       expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1878       expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1879    }
1880
1881    if(p_exp2) {
1882       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1883                                      Elements(lp_build_exp2_polynomial));
1884
1885       res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
1886    }
1887
1888    if(p_exp2_int_part)
1889       *p_exp2_int_part = expipart;
1890
1891    if(p_frac_part)
1892       *p_frac_part = fpart;
1893
1894    if(p_exp2)
1895       *p_exp2 = res;
1896 }
1897
1898
1899 LLVMValueRef
1900 lp_build_exp2(struct lp_build_context *bld,
1901               LLVMValueRef x)
1902 {
1903    LLVMValueRef res;
1904    lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1905    return res;
1906 }
1907
1908
1909 /**
1910  * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1911  * These coefficients can be generate with
1912  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1913  */
1914 const double lp_build_log2_polynomial[] = {
1915 #if LOG_POLY_DEGREE == 6
1916    3.11578814719469302614,
1917    -3.32419399085241980044,
1918    2.59883907202499966007,
1919    -1.23152682416275988241,
1920    0.318212422185251071475,
1921    -0.0344359067839062357313
1922 #elif LOG_POLY_DEGREE == 5
1923    2.8882704548164776201,
1924    -2.52074962577807006663,
1925    1.48116647521213171641,
1926    -0.465725644288844778798,
1927    0.0596515482674574969533
1928 #elif LOG_POLY_DEGREE == 4
1929    2.61761038894603480148,
1930    -1.75647175389045657003,
1931    0.688243882994381274313,
1932    -0.107254423828329604454
1933 #elif LOG_POLY_DEGREE == 3
1934    2.28330284476918490682,
1935    -1.04913055217340124191,
1936    0.204446009836232697516
1937 #else
1938 #error
1939 #endif
1940 };
1941
1942
1943 /**
1944  * See http://www.devmaster.net/forums/showthread.php?p=43580
1945  */
1946 void
1947 lp_build_log2_approx(struct lp_build_context *bld,
1948                      LLVMValueRef x,
1949                      LLVMValueRef *p_exp,
1950                      LLVMValueRef *p_floor_log2,
1951                      LLVMValueRef *p_log2)
1952 {
1953    const struct lp_type type = bld->type;
1954    LLVMTypeRef vec_type = lp_build_vec_type(type);
1955    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1956
1957    LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
1958    LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
1959    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1960
1961    LLVMValueRef i = NULL;
1962    LLVMValueRef exp = NULL;
1963    LLVMValueRef mant = NULL;
1964    LLVMValueRef logexp = NULL;
1965    LLVMValueRef logmant = NULL;
1966    LLVMValueRef res = NULL;
1967
1968    if(p_exp || p_floor_log2 || p_log2) {
1969       /* TODO: optimize the constant case */
1970       if(LLVMIsConstant(x))
1971          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1972                       __FUNCTION__);
1973
1974       assert(type.floating && type.width == 32);
1975
1976       i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1977
1978       /* exp = (float) exponent(x) */
1979       exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1980    }
1981
1982    if(p_floor_log2 || p_log2) {
1983       logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
1984       logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
1985       logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1986    }
1987
1988    if(p_log2) {
1989       /* mant = (float) mantissa(x) */
1990       mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1991       mant = LLVMBuildOr(bld->builder, mant, one, "");
1992       mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1993
1994       logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1995                                     Elements(lp_build_log2_polynomial));
1996
1997       /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1998       logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
1999
2000       res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2001    }
2002
2003    if(p_exp) {
2004       exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2005       *p_exp = exp;
2006    }
2007
2008    if(p_floor_log2)
2009       *p_floor_log2 = logexp;
2010
2011    if(p_log2)
2012       *p_log2 = res;
2013 }
2014
2015
2016 LLVMValueRef
2017 lp_build_log2(struct lp_build_context *bld,
2018               LLVMValueRef x)
2019 {
2020    LLVMValueRef res;
2021    lp_build_log2_approx(bld, x, NULL, NULL, &res);
2022    return res;
2023 }