gallivm: Newton-Raphson step to improve precision.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_debug.h"
60 #include "lp_bld_arit.h"
61
62
63 /**
64 * Generate min(a, b)
65 * No checks for special-case values of a or b (0 or 1) are done.
66 */
67 static LLVMValueRef
68 lp_build_min_simple(struct lp_build_context *bld,
69 LLVMValueRef a,
70 LLVMValueRef b)
71 {
72 const struct lp_type type = bld->type;
73 const char *intrinsic = NULL;
74 LLVMValueRef cond;
75
76 /* TODO: optimize the constant case */
77
78 if(type.width * type.length == 128) {
79 if(type.floating) {
80 if(type.width == 32 && util_cpu_caps.has_sse)
81 intrinsic = "llvm.x86.sse.min.ps";
82 if(type.width == 64 && util_cpu_caps.has_sse2)
83 intrinsic = "llvm.x86.sse2.min.pd";
84 }
85 else {
86 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
87 intrinsic = "llvm.x86.sse2.pminu.b";
88 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
89 intrinsic = "llvm.x86.sse41.pminsb";
90 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
91 intrinsic = "llvm.x86.sse41.pminuw";
92 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
93 intrinsic = "llvm.x86.sse2.pmins.w";
94 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
95 intrinsic = "llvm.x86.sse41.pminud";
96 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
97 intrinsic = "llvm.x86.sse41.pminsd";
98 }
99 }
100
101 if(intrinsic)
102 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
103
104 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
105 return lp_build_select(bld, cond, a, b);
106 }
107
108
109 /**
110 * Generate max(a, b)
111 * No checks for special-case values of a or b (0 or 1) are done.
112 */
113 static LLVMValueRef
114 lp_build_max_simple(struct lp_build_context *bld,
115 LLVMValueRef a,
116 LLVMValueRef b)
117 {
118 const struct lp_type type = bld->type;
119 const char *intrinsic = NULL;
120 LLVMValueRef cond;
121
122 /* TODO: optimize the constant case */
123
124 if(type.width * type.length == 128) {
125 if(type.floating) {
126 if(type.width == 32 && util_cpu_caps.has_sse)
127 intrinsic = "llvm.x86.sse.max.ps";
128 if(type.width == 64 && util_cpu_caps.has_sse2)
129 intrinsic = "llvm.x86.sse2.max.pd";
130 }
131 else {
132 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
133 intrinsic = "llvm.x86.sse2.pmaxu.b";
134 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
135 intrinsic = "llvm.x86.sse41.pmaxsb";
136 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
137 intrinsic = "llvm.x86.sse41.pmaxuw";
138 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
139 intrinsic = "llvm.x86.sse2.pmaxs.w";
140 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
141 intrinsic = "llvm.x86.sse41.pmaxud";
142 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
143 intrinsic = "llvm.x86.sse41.pmaxsd";
144 }
145 }
146
147 if(intrinsic)
148 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
149
150 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
151 return lp_build_select(bld, cond, a, b);
152 }
153
154
155 /**
156 * Generate 1 - a, or ~a depending on bld->type.
157 */
158 LLVMValueRef
159 lp_build_comp(struct lp_build_context *bld,
160 LLVMValueRef a)
161 {
162 const struct lp_type type = bld->type;
163
164 if(a == bld->one)
165 return bld->zero;
166 if(a == bld->zero)
167 return bld->one;
168
169 if(type.norm && !type.floating && !type.fixed && !type.sign) {
170 if(LLVMIsConstant(a))
171 return LLVMConstNot(a);
172 else
173 return LLVMBuildNot(bld->builder, a, "");
174 }
175
176 if(LLVMIsConstant(a))
177 return LLVMConstSub(bld->one, a);
178 else
179 return LLVMBuildSub(bld->builder, bld->one, a, "");
180 }
181
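/*
 * Why the bitwise-NOT shortcut above is valid: for unsigned normalized
 * types 1.0 is represented by an all-ones bit pattern, so 1.0 - a is
 * simply ~a.  A scalar sketch in plain C (hypothetical helper, unorm8
 * case, assumes <stdint.h>):
 */
#if 0
static inline uint8_t
comp_unorm8(uint8_t a)
{
   return (uint8_t)~a;   /* == 255 - a, i.e. 1.0 - a in unorm8 */
}
#endif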
182
183 /**
184 * Generate a + b
185 */
186 LLVMValueRef
187 lp_build_add(struct lp_build_context *bld,
188 LLVMValueRef a,
189 LLVMValueRef b)
190 {
191 const struct lp_type type = bld->type;
192 LLVMValueRef res;
193
194 if(a == bld->zero)
195 return b;
196 if(b == bld->zero)
197 return a;
198 if(a == bld->undef || b == bld->undef)
199 return bld->undef;
200
201 if(bld->type.norm) {
202 const char *intrinsic = NULL;
203
204 if(a == bld->one || b == bld->one)
205 return bld->one;
206
207 if(util_cpu_caps.has_sse2 &&
208 type.width * type.length == 128 &&
209 !type.floating && !type.fixed) {
210 if(type.width == 8)
211 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
212 if(type.width == 16)
213 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
214 }
215
216 if(intrinsic)
217 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
218 }
219
220 if(LLVMIsConstant(a) && LLVMIsConstant(b))
221 res = LLVMConstAdd(a, b);
222 else
223 res = LLVMBuildAdd(bld->builder, a, b, "");
224
225 /* clamp to ceiling of 1.0 */
226 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
227 res = lp_build_min_simple(bld, res, bld->one);
228
229 /* XXX clamp to floor of -1 or 0??? */
230
231 return res;
232 }
233
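/*
 * The padds/paddus intrinsics above provide the clamping implicitly.
 * A scalar sketch of the same saturating semantics for unorm8 values
 * (hypothetical helper, plain C):
 */
#if 0
static inline unsigned
add_unorm8(unsigned a, unsigned b)   /* a, b in [0, 255] */
{
   unsigned s = a + b;
   return s > 255 ? 255 : s;   /* clamp to the representation of 1.0 */
}
#endif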
234
235 /** Return the sum of the elements of a */
236 LLVMValueRef
237 lp_build_sum_vector(struct lp_build_context *bld,
238 LLVMValueRef a)
239 {
240 const struct lp_type type = bld->type;
241 LLVMValueRef index, res;
242 int i;
243
244 if (a == bld->zero)
245 return bld->zero;
246 if (a == bld->undef)
247 return bld->undef;
248 assert(type.length > 1);
249
250 assert(!bld->type.norm);
251
252 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
253 res = LLVMBuildExtractElement(bld->builder, a, index, "");
254
255 for (i = 1; i < type.length; i++) {
256 index = LLVMConstInt(LLVMInt32Type(), i, 0);
257 res = LLVMBuildAdd(bld->builder, res,
258 LLVMBuildExtractElement(bld->builder, a, index, ""),
259 "");
260 }
261
262 return res;
263 }
264
265
266 /**
267 * Generate a - b
268 */
269 LLVMValueRef
270 lp_build_sub(struct lp_build_context *bld,
271 LLVMValueRef a,
272 LLVMValueRef b)
273 {
274 const struct lp_type type = bld->type;
275 LLVMValueRef res;
276
277 if(b == bld->zero)
278 return a;
279 if(a == bld->undef || b == bld->undef)
280 return bld->undef;
281 if(a == b)
282 return bld->zero;
283
284 if(bld->type.norm) {
285 const char *intrinsic = NULL;
286
287 if(b == bld->one)
288 return bld->zero;
289
290 if(util_cpu_caps.has_sse2 &&
291 type.width * type.length == 128 &&
292 !type.floating && !type.fixed) {
293 if(type.width == 8)
294 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
295 if(type.width == 16)
296 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
297 }
298
299 if(intrinsic)
300 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
301 }
302
303 if(LLVMIsConstant(a) && LLVMIsConstant(b))
304 res = LLVMConstSub(a, b);
305 else
306 res = LLVMBuildSub(bld->builder, a, b, "");
307
308 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
309 res = lp_build_max_simple(bld, res, bld->zero);
310
311 return res;
312 }
313
314
315 /**
316 * Normalized 8bit multiplication.
317 *
318 * - alpha plus one
319 *
320 * makes the following approximation to the division (Sree)
321 *
322 * a*b/255 ~= (a*(b + 1)) >> 8
323 *
324 * which is the fastest method that satisfies the following OpenGL criteria
325 *
326 * 0*0 = 0 and 255*255 = 255
327 *
328 * - geometric series
329 *
330 * takes the geometric series approximation to the division
331 *
332 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
333 *
334 * in this case just the first two terms to fit in 16bit arithmetic
335 *
336 * t/255 ~= (t + (t >> 8)) >> 8
337 *
338 * note that just by itself it doesn't satisfy the OpenGL criteria, as
339 * 255*255 yields 254, so the special case b = 255 must be accounted for,
340 * or rounding must be used
341 *
342 * - geometric series plus rounding
343 *
344 * when using the geometric series division, instead of truncating the
345 * result, apply rounding in the approximation (Jim Blinn)
346 *
347 * t/255 = ((t + 0x80) + ((t + 0x80) >> 8)) >> 8
348 *
349 * which achieves exact results for all 8-bit products
350 *
351 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
352 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
353 * @sa Michael Herf, The "double blend trick", May 2000,
354 * http://www.stereopsis.com/doubleblend.html
355 */
356 static LLVMValueRef
357 lp_build_mul_u8n(LLVMBuilderRef builder,
358 struct lp_type i16_type,
359 LLVMValueRef a, LLVMValueRef b)
360 {
361 LLVMValueRef c8;
362 LLVMValueRef ab;
363
364 c8 = lp_build_const_int_vec(i16_type, 8);
365
366 #if 0
367
368 /* a*b/255 ~= (a*(b + 1)) >> 8 */
369 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
370 ab = LLVMBuildMul(builder, a, b, "");
371
372 #else
373
374 /* ab/255 = ((ab + 0x80) + ((ab + 0x80) >> 8)) >> 8 (Blinn's exact rounding) */
375 ab = LLVMBuildMul(builder, a, b, "");
376 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
377 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
378
379 #endif
380
381 ab = LLVMBuildLShr(builder, ab, c8, "");
382
383 return ab;
384 }
385
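/*
 * A brute-force scalar check of the rounding variant used above
 * (hypothetical test, plain C): for every product t = a*b of 8-bit
 * values, ((t + 0x80) + ((t + 0x80) >> 8)) >> 8 equals the exactly
 * rounded t/255, which for integers is (t + 127) / 255.
 */
#if 0
#include <assert.h>
static void
check_mul_u8n(void)
{
   unsigned a, b;
   for (a = 0; a <= 255; ++a) {
      for (b = 0; b <= 255; ++b) {
         unsigned t = a * b + 0x80;
         unsigned approx = (t + (t >> 8)) >> 8;
         assert(approx == (a * b + 127) / 255);   /* round(a*b / 255.0) */
      }
   }
}
#endif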
386
387 /**
388 * Generate a * b
389 */
390 LLVMValueRef
391 lp_build_mul(struct lp_build_context *bld,
392 LLVMValueRef a,
393 LLVMValueRef b)
394 {
395 const struct lp_type type = bld->type;
396 LLVMValueRef shift;
397 LLVMValueRef res;
398
399 if(a == bld->zero)
400 return bld->zero;
401 if(a == bld->one)
402 return b;
403 if(b == bld->zero)
404 return bld->zero;
405 if(b == bld->one)
406 return a;
407 if(a == bld->undef || b == bld->undef)
408 return bld->undef;
409
410 if(!type.floating && !type.fixed && type.norm) {
411 if(type.width == 8) {
412 struct lp_type i16_type = lp_wider_type(type);
413 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
414
415 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
416 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
417
418 /* PMULLW, PSRLW, PADDW */
419 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
420 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
421
422 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
423
424 return ab;
425 }
426
427 /* FIXME */
428 assert(0);
429 }
430
431 if(type.fixed)
432 shift = lp_build_const_int_vec(type, type.width/2);
433 else
434 shift = NULL;
435
436 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
437 res = LLVMConstMul(a, b);
438 if(shift) {
439 if(type.sign)
440 res = LLVMConstAShr(res, shift);
441 else
442 res = LLVMConstLShr(res, shift);
443 }
444 }
445 else {
446 res = LLVMBuildMul(bld->builder, a, b, "");
447 if(shift) {
448 if(type.sign)
449 res = LLVMBuildAShr(bld->builder, res, shift, "");
450 else
451 res = LLVMBuildLShr(bld->builder, res, shift, "");
452 }
453 }
454
455 return res;
456 }
457
458
459 /**
460 * Small vector x scale multiplication optimization.
461 */
462 LLVMValueRef
463 lp_build_mul_imm(struct lp_build_context *bld,
464 LLVMValueRef a,
465 int b)
466 {
467 LLVMValueRef factor;
468
469 if(b == 0)
470 return bld->zero;
471
472 if(b == 1)
473 return a;
474
475 if(b == -1)
476 return LLVMBuildNeg(bld->builder, a, "");
477
478 if(b == 2 && bld->type.floating)
479 return lp_build_add(bld, a, a);
480
481 if(util_is_pot(b)) {
482 unsigned shift = ffs(b) - 1;
483
484 if(bld->type.floating) {
485 #if 0
486 /*
487 * Power of two multiplication by directly manipulating the mantissa.
488 *
489 * XXX: This might not be always faster, it will introduce a small error
490 * for multiplication by zero, and it will produce wrong results
491 * for Inf and NaN.
492 */
493 unsigned mantissa = lp_mantissa(bld->type);
494 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
495 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
496 a = LLVMBuildAdd(bld->builder, a, factor, "");
497 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
498 return a;
499 #endif
500 }
501 else {
502 factor = lp_build_const_vec(bld->type, shift);
503 return LLVMBuildShl(bld->builder, a, factor, "");
504 }
505 }
506
507 factor = lp_build_const_vec(bld->type, (double)b);
508 return lp_build_mul(bld, a, factor);
509 }
510
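/*
 * Scalar illustration of the disabled mantissa trick above (assumes
 * IEEE-754 single precision and <stdint.h>; hypothetical helper):
 * multiplying a positive finite float by 2^shift is just an integer
 * addition to the biased exponent field.
 */
#if 0
static inline float
mul_pot(float x, unsigned shift)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   u.i += (uint32_t)shift << 23;   /* bump the exponent by 'shift' */
   return u.f;                     /* wrong for 0, denormals, Inf and NaN */
}
#endif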
511
512 /**
513 * Generate a / b
514 */
515 LLVMValueRef
516 lp_build_div(struct lp_build_context *bld,
517 LLVMValueRef a,
518 LLVMValueRef b)
519 {
520 const struct lp_type type = bld->type;
521
522 if(a == bld->zero)
523 return bld->zero;
524 if(a == bld->one)
525 return lp_build_rcp(bld, b);
526 if(b == bld->zero)
527 return bld->undef;
528 if(b == bld->one)
529 return a;
530 if(a == bld->undef || b == bld->undef)
531 return bld->undef;
532
533 if(LLVMIsConstant(a) && LLVMIsConstant(b))
534 return LLVMConstFDiv(a, b);
535
536 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
537 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
538
539 return LLVMBuildFDiv(bld->builder, a, b, "");
540 }
541
542
543 /**
544 * Linear interpolation.
545 *
546 * This also works for integer values with a few caveats.
547 *
548 * @sa http://www.stereopsis.com/doubleblend.html
549 */
550 LLVMValueRef
551 lp_build_lerp(struct lp_build_context *bld,
552 LLVMValueRef x,
553 LLVMValueRef v0,
554 LLVMValueRef v1)
555 {
556 LLVMValueRef delta;
557 LLVMValueRef res;
558
559 delta = lp_build_sub(bld, v1, v0);
560
561 res = lp_build_mul(bld, x, delta);
562
563 res = lp_build_add(bld, v0, res);
564
565 if(bld->type.fixed)
566 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
567 * but it will be wrong for other uses. Basically we need a more
568 * powerful lp_type, capable of further distinguishing the values
569 * interpretation from the value storage. */
570 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
571
572 return res;
573 }
574
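/*
 * Scalar model of the fixed-point path (hypothetical helper): with
 * 8-bit colors in 16-bit lanes and x a 0.8 fixed-point weight, the
 * weighted form below is equivalent to v0 + ((x * (v1 - v0)) >> 8)
 * with an arithmetic shift, but avoids shifting a signed value.
 */
#if 0
static inline unsigned
lerp_u8(unsigned x, unsigned v0, unsigned v1)   /* x in [0, 256] */
{
   return (v0 * (256 - x) + v1 * x) >> 8;
}
#endif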
575
576 LLVMValueRef
577 lp_build_lerp_2d(struct lp_build_context *bld,
578 LLVMValueRef x,
579 LLVMValueRef y,
580 LLVMValueRef v00,
581 LLVMValueRef v01,
582 LLVMValueRef v10,
583 LLVMValueRef v11)
584 {
585 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
586 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
587 return lp_build_lerp(bld, y, v0, v1);
588 }
589
590
591 /**
592 * Generate min(a, b)
593 * Do checks for special cases.
594 */
595 LLVMValueRef
596 lp_build_min(struct lp_build_context *bld,
597 LLVMValueRef a,
598 LLVMValueRef b)
599 {
600 if(a == bld->undef || b == bld->undef)
601 return bld->undef;
602
603 if(a == b)
604 return a;
605
606 if(bld->type.norm) {
607 if(a == bld->zero || b == bld->zero)
608 return bld->zero;
609 if(a == bld->one)
610 return b;
611 if(b == bld->one)
612 return a;
613 }
614
615 return lp_build_min_simple(bld, a, b);
616 }
617
618
619 /**
620 * Generate max(a, b)
621 * Do checks for special cases.
622 */
623 LLVMValueRef
624 lp_build_max(struct lp_build_context *bld,
625 LLVMValueRef a,
626 LLVMValueRef b)
627 {
628 if(a == bld->undef || b == bld->undef)
629 return bld->undef;
630
631 if(a == b)
632 return a;
633
634 if(bld->type.norm) {
635 if(a == bld->one || b == bld->one)
636 return bld->one;
637 if(a == bld->zero)
638 return b;
639 if(b == bld->zero)
640 return a;
641 }
642
643 return lp_build_max_simple(bld, a, b);
644 }
645
646
647 /**
648 * Generate clamp(a, min, max)
649 * Do checks for special cases.
650 */
651 LLVMValueRef
652 lp_build_clamp(struct lp_build_context *bld,
653 LLVMValueRef a,
654 LLVMValueRef min,
655 LLVMValueRef max)
656 {
657 a = lp_build_min(bld, a, max);
658 a = lp_build_max(bld, a, min);
659 return a;
660 }
661
662
663 /**
664 * Generate abs(a)
665 */
666 LLVMValueRef
667 lp_build_abs(struct lp_build_context *bld,
668 LLVMValueRef a)
669 {
670 const struct lp_type type = bld->type;
671 LLVMTypeRef vec_type = lp_build_vec_type(type);
672
673 if(!type.sign)
674 return a;
675
676 if(type.floating) {
677 /* Mask out the sign bit */
678 if (type.length == 1) {
679 LLVMTypeRef int_type = LLVMIntType(type.width);
680 LLVMTypeRef float_type = LLVMFloatType();
681 unsigned long long absMask = ~(1ULL << (type.width - 1));
682 LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0);
683 a = LLVMBuildBitCast(bld->builder, a, int_type, "");
684 a = LLVMBuildAnd(bld->builder, a, mask, "");
685 a = LLVMBuildBitCast(bld->builder, a, float_type, "");
686 return a;
687 }
688 else {
689 /* vector of floats */
690 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
691 unsigned long long absMask = ~(1ULL << (type.width - 1));
692 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
693 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
694 a = LLVMBuildAnd(bld->builder, a, mask, "");
695 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
696 return a;
697 }
698 }
699
700 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
701 switch(type.width) {
702 case 8:
703 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
704 case 16:
705 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
706 case 32:
707 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
708 }
709 }
710
711 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
712 }
713
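/*
 * Scalar form of the sign-mask trick used above for floats (assumes
 * IEEE-754 single precision and <stdint.h>; hypothetical helper):
 */
#if 0
static inline float
fabs_bits(float a)
{
   union { float f; uint32_t i; } u;
   u.f = a;
   u.i &= 0x7fffffffu;   /* clear the sign bit */
   return u.f;
}
#endif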
714
715 LLVMValueRef
716 lp_build_negate(struct lp_build_context *bld,
717 LLVMValueRef a)
718 {
719 return LLVMBuildNeg(bld->builder, a, "");
720 }
721
722
723 /** Return -1, 0 or +1 depending on the sign of a */
724 LLVMValueRef
725 lp_build_sgn(struct lp_build_context *bld,
726 LLVMValueRef a)
727 {
728 const struct lp_type type = bld->type;
729 LLVMValueRef cond;
730 LLVMValueRef res;
731
732 /* Handle non-zero case */
733 if(!type.sign) {
734 /* if not zero then sign must be positive */
735 res = bld->one;
736 }
737 else if(type.floating) {
738 LLVMTypeRef vec_type;
739 LLVMTypeRef int_type;
740 LLVMValueRef mask;
741 LLVMValueRef sign;
742 LLVMValueRef one;
743 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
744
745 if (type.length == 1) {
746 int_type = lp_build_int_elem_type(type);
747 vec_type = lp_build_elem_type(type);
748 mask = LLVMConstInt(int_type, maskBit, 0);
749 }
750 else {
751 /* vector */
752 int_type = lp_build_int_vec_type(type);
753 vec_type = lp_build_vec_type(type);
754 mask = lp_build_const_int_vec(type, maskBit);
755 }
756
757 /* Take the sign bit and add it to 1 constant */
758 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
759 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
760 one = LLVMConstBitCast(bld->one, int_type);
761 res = LLVMBuildOr(bld->builder, sign, one, "");
762 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
763 }
764 else
765 {
766 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
767 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
768 res = lp_build_select(bld, cond, bld->one, minus_one);
769 }
770
771 /* Handle zero */
772 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
773 res = lp_build_select(bld, cond, bld->zero, res);
774
775 return res;
776 }
777
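/*
 * Scalar equivalent of the floating-point path above (assumes IEEE-754
 * single precision and <stdint.h>; hypothetical helper): OR the sign
 * bit of 'a' into the bit pattern of 1.0f, giving +1.0f or -1.0f; the
 * zero case is selected separately, as above.
 */
#if 0
static inline float
sgn_nonzero(float a)
{
   union { float f; uint32_t i; } u, one;
   u.f = a;
   one.f = 1.0f;
   one.i |= u.i & 0x80000000u;   /* copy the sign bit onto 1.0f */
   return one.f;
}
#endif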
778
779 /**
780 * Set the sign of float vector 'a' according to 'sign'.
781 * If sign==0, return abs(a).
782 * If sign==1, return -abs(a);
783 * Other values for sign produce undefined results.
784 */
785 LLVMValueRef
786 lp_build_set_sign(struct lp_build_context *bld,
787 LLVMValueRef a, LLVMValueRef sign)
788 {
789 const struct lp_type type = bld->type;
790 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
791 LLVMTypeRef vec_type = lp_build_vec_type(type);
792 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
793 LLVMValueRef mask = lp_build_const_int_vec(type,
794 ~((unsigned long long) 1 << (type.width - 1)));
795 LLVMValueRef val, res;
796
797 assert(type.floating);
798
799 /* val = reinterpret_cast<int>(a) */
800 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
801 /* val = val & mask */
802 val = LLVMBuildAnd(bld->builder, val, mask, "");
803 /* sign = sign << shift */
804 sign = LLVMBuildShl(bld->builder, sign, shift, "");
805 /* res = val | sign */
806 res = LLVMBuildOr(bld->builder, val, sign, "");
807 /* res = reinterpret_cast<float>(res) */
808 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
809
810 return res;
811 }
812
813
814 /**
815 * Convert vector of (or scalar) int to vector of (or scalar) float.
816 */
817 LLVMValueRef
818 lp_build_int_to_float(struct lp_build_context *bld,
819 LLVMValueRef a)
820 {
821 const struct lp_type type = bld->type;
822
823 assert(type.floating);
824 /*assert(lp_check_value(type, a));*/
825
826 if (type.length == 1) {
827 LLVMTypeRef float_type = LLVMFloatType();
828 return LLVMBuildSIToFP(bld->builder, a, float_type, "");
829 }
830 else {
831 LLVMTypeRef vec_type = lp_build_vec_type(type);
832 /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
833 LLVMValueRef res;
834 res = LLVMBuildSIToFP(bld->builder, a, vec_type, "");
835 return res;
836 }
837 }
838
839
840
841 enum lp_build_round_sse41_mode
842 {
843 LP_BUILD_ROUND_SSE41_NEAREST = 0,
844 LP_BUILD_ROUND_SSE41_FLOOR = 1,
845 LP_BUILD_ROUND_SSE41_CEIL = 2,
846 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
847 };
848
849
850 static INLINE LLVMValueRef
851 lp_build_round_sse41(struct lp_build_context *bld,
852 LLVMValueRef a,
853 enum lp_build_round_sse41_mode mode)
854 {
855 const struct lp_type type = bld->type;
856 LLVMTypeRef vec_type = lp_build_vec_type(type);
857 const char *intrinsic;
858
859 assert(type.floating);
860 assert(type.width*type.length == 128);
861 assert(lp_check_value(type, a));
862 assert(util_cpu_caps.has_sse4_1);
863
864 switch(type.width) {
865 case 32:
866 intrinsic = "llvm.x86.sse41.round.ps";
867 break;
868 case 64:
869 intrinsic = "llvm.x86.sse41.round.pd";
870 break;
871 default:
872 assert(0);
873 return bld->undef;
874 }
875
876 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
877 LLVMConstInt(LLVMInt32Type(), mode, 0));
878 }
879
880
881 LLVMValueRef
882 lp_build_trunc(struct lp_build_context *bld,
883 LLVMValueRef a)
884 {
885 const struct lp_type type = bld->type;
886
887 assert(type.floating);
888 assert(lp_check_value(type, a));
889
890 if(util_cpu_caps.has_sse4_1)
891 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
892 else {
893 LLVMTypeRef vec_type = lp_build_vec_type(type);
894 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
895 LLVMValueRef res;
896 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
897 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
898 return res;
899 }
900 }
901
902
903 LLVMValueRef
904 lp_build_round(struct lp_build_context *bld,
905 LLVMValueRef a)
906 {
907 const struct lp_type type = bld->type;
908
909 assert(type.floating);
910 assert(lp_check_value(type, a));
911
912 if(util_cpu_caps.has_sse4_1)
913 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
914 else {
915 LLVMTypeRef vec_type = lp_build_vec_type(type);
916 LLVMValueRef res;
917 res = lp_build_iround(bld, a);
918 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
919 return res;
920 }
921 }
922
923
924 LLVMValueRef
925 lp_build_floor(struct lp_build_context *bld,
926 LLVMValueRef a)
927 {
928 const struct lp_type type = bld->type;
929
930 assert(type.floating);
931
932 if (type.length == 1) {
933 LLVMValueRef res;
934 res = lp_build_ifloor(bld, a);
935 res = LLVMBuildSIToFP(bld->builder, res, LLVMFloatType(), "");
936 return res;
937 }
938
939 if(util_cpu_caps.has_sse4_1)
940 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
941 else {
942 LLVMTypeRef vec_type = lp_build_vec_type(type);
943 LLVMValueRef res;
944 res = lp_build_ifloor(bld, a);
945 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
946 return res;
947 }
948 }
949
950
951 LLVMValueRef
952 lp_build_ceil(struct lp_build_context *bld,
953 LLVMValueRef a)
954 {
955 const struct lp_type type = bld->type;
956
957 assert(type.floating);
958 assert(lp_check_value(type, a));
959
960 if(util_cpu_caps.has_sse4_1)
961 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
962 else {
963 LLVMTypeRef vec_type = lp_build_vec_type(type);
964 LLVMValueRef res;
965 res = lp_build_iceil(bld, a);
966 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
967 return res;
968 }
969 }
970
971
972 /**
973 * Return fractional part of 'a' computed as a - floor(a)
974 * Typically used in texture coord arithmetic.
975 */
976 LLVMValueRef
977 lp_build_fract(struct lp_build_context *bld,
978 LLVMValueRef a)
979 {
980 assert(bld->type.floating);
981 return lp_build_sub(bld, a, lp_build_floor(bld, a));
982 }
983
984
985 /**
986 * Convert to integer, through whichever rounding method that's fastest,
987 * typically truncating toward zero.
988 */
989 LLVMValueRef
990 lp_build_itrunc(struct lp_build_context *bld,
991 LLVMValueRef a)
992 {
993 const struct lp_type type = bld->type;
994
995 assert(type.floating);
996
997 if (type.length == 1) {
998 LLVMTypeRef int_type = LLVMIntType(type.width);
999 return LLVMBuildFPToSI(bld->builder, a, int_type, "");
1000 }
1001 else {
1002 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1003 assert(lp_check_value(type, a));
1004 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1005 }
1006 }
1007
1008
1009 /**
1010 * Convert float[] to int[] with round().
1011 */
1012 LLVMValueRef
1013 lp_build_iround(struct lp_build_context *bld,
1014 LLVMValueRef a)
1015 {
1016 const struct lp_type type = bld->type;
1017 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1018 LLVMValueRef res;
1019
1020 assert(type.floating);
1021
1022 if (type.length == 1) {
1023 /* scalar float to int */
1024 LLVMTypeRef int_type = LLVMIntType(type.width);
1025 /* XXX we want rounding here! */
1026 res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
1027 return res;
1028 }
1029
1030 assert(lp_check_value(type, a));
1031
1032 if(util_cpu_caps.has_sse4_1) {
1033 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1034 }
1035 else {
1036 LLVMTypeRef vec_type = lp_build_vec_type(type);
1037 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1038 LLVMValueRef sign;
1039 LLVMValueRef half;
1040
1041 /* get sign bit */
1042 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1043 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1044
1045 /* sign * 0.5 */
1046 half = lp_build_const_vec(type, 0.5);
1047 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1048 half = LLVMBuildOr(bld->builder, sign, half, "");
1049 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1050
1051 res = LLVMBuildAdd(bld->builder, a, half, "");
1052 }
1053
1054 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1055
1056 return res;
1057 }
1058
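/*
 * Scalar model of the non-SSE4.1 path above (hypothetical helper):
 * copy the sign of 'a' onto 0.5, add, and let the float-to-int
 * truncation round to nearest (half away from zero).
 */
#if 0
static inline int
iround_f(float a)
{
   float half = a < 0.0f ? -0.5f : 0.5f;   /* sign-adjusted 0.5 */
   return (int)(a + half);                 /* truncate toward zero */
}
#endif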
1059
1060 /**
1061 * Convert float[] to int[] with floor().
1062 */
1063 LLVMValueRef
1064 lp_build_ifloor(struct lp_build_context *bld,
1065 LLVMValueRef a)
1066 {
1067 const struct lp_type type = bld->type;
1068 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1069 LLVMValueRef res;
1070
1071 assert(type.floating);
1072
1073 if (type.length == 1) {
1074 /* scalar float to int */
1075 LLVMTypeRef int_type = LLVMIntType(type.width);
1076 res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
1077 return res;
1078 }
1079
1080 assert(lp_check_value(type, a));
1081
1082 if(util_cpu_caps.has_sse4_1) {
1083 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1084 }
1085 else {
1086 /* Take the sign bit and add it to 1 constant */
1087 LLVMTypeRef vec_type = lp_build_vec_type(type);
1088 unsigned mantissa = lp_mantissa(type);
1089 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1090 LLVMValueRef sign;
1091 LLVMValueRef offset;
1092
1093 /* sign = a < 0 ? ~0 : 0 */
1094 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1095 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1096 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
1097 lp_build_name(sign, "floor.sign");
1098
1099 /* offset = -0.99999(9)f */
1100 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
1101 offset = LLVMConstBitCast(offset, int_vec_type);
1102
1103 /* offset = a < 0 ? -0.99999(9)f : 0.0f */
1104 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1105 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
1106 lp_build_name(offset, "floor.offset");
1107
1108 res = LLVMBuildAdd(bld->builder, a, offset, "");
1109 lp_build_name(res, "floor.res");
1110 }
1111
1112 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1113 lp_build_name(res, "floor");
1114
1115 return res;
1116 }
1117
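/*
 * Scalar model of the non-SSE4.1 floor above (hypothetical helper):
 * for negative inputs add an offset just under 1 in magnitude so that
 * truncation toward zero lands on floor(a).  Like the code above, this
 * is an approximation: it can be off by one for negative values
 * extremely close to an integer.
 */
#if 0
static inline int
ifloor_f(float a)
{
   const float offset = -0.99999988f;   /* -(2^23 - 1) / 2^23 */
   return (int)(a < 0.0f ? a + offset : a);
}
#endif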
1118
1119 LLVMValueRef
1120 lp_build_iceil(struct lp_build_context *bld,
1121 LLVMValueRef a)
1122 {
1123 const struct lp_type type = bld->type;
1124 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1125 LLVMValueRef res;
1126
1127 assert(type.floating);
1128 assert(lp_check_value(type, a));
1129
1130 if(util_cpu_caps.has_sse4_1) {
1131 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1132 }
1133 else {
1134 assert(0);
1135 res = bld->undef;
1136 }
1137
1138 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1139
1140 return res;
1141 }
1142
1143
1144 LLVMValueRef
1145 lp_build_sqrt(struct lp_build_context *bld,
1146 LLVMValueRef a)
1147 {
1148 const struct lp_type type = bld->type;
1149 LLVMTypeRef vec_type = lp_build_vec_type(type);
1150 char intrinsic[32];
1151
1152 /* TODO: optimize the constant case */
1154
1155 assert(type.floating);
1156 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1157
1158 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1159 }
1160
1161
1162 LLVMValueRef
1163 lp_build_rcp(struct lp_build_context *bld,
1164 LLVMValueRef a)
1165 {
1166 const struct lp_type type = bld->type;
1167
1168 if(a == bld->zero)
1169 return bld->undef;
1170 if(a == bld->one)
1171 return bld->one;
1172 if(a == bld->undef)
1173 return bld->undef;
1174
1175 assert(type.floating);
1176
1177 if(LLVMIsConstant(a))
1178 return LLVMConstFDiv(bld->one, a);
1179
1180 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1181 /*
1182 * XXX: Added precision is not always necessary, so only enable this
1183 * when we have a better system in place to track minimum precision.
1184 */
1185
1186 #if 0
1187 /*
1188 * Do one Newton-Raphson step to improve precision:
1189 *
1190 * x1 = (2 - a * rcp(a)) * rcp(a)
1191 */
1192
1193 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1194 LLVMValueRef rcp_a;
1195 LLVMValueRef res;
1196
1197 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1198
1199 res = LLVMBuildMul(bld->builder, a, rcp_a, "");
1200 res = LLVMBuildSub(bld->builder, two, res, "");
1201 res = LLVMBuildMul(bld->builder, res, rcp_a, "");
1202
1203 return res;
1204 #else
1205 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1206 #endif
1207 }
1208
1209 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1210 }
1211
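/*
 * The Newton-Raphson refinement mentioned above, in scalar form
 * (hypothetical helper): given an estimate x0 ~= 1/a, such as the
 * roughly 12-bit result of RCPPS, one step approximately doubles the
 * number of correct bits:
 *
 *    x1 = x0 * (2 - a * x0)
 */
#if 0
static inline float
rcp_refine(float a, float x0)
{
   return x0 * (2.0f - a * x0);
}
#endif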
1212
1213 /**
1214 * Generate 1/sqrt(a)
1215 */
1216 LLVMValueRef
1217 lp_build_rsqrt(struct lp_build_context *bld,
1218 LLVMValueRef a)
1219 {
1220 const struct lp_type type = bld->type;
1221
1222 assert(type.floating);
1223
1224 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1225 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1226
1227 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1228 }
1229
1230
1231 /**
1232 * Generate cos(a)
1233 */
1234 LLVMValueRef
1235 lp_build_cos(struct lp_build_context *bld,
1236 LLVMValueRef a)
1237 {
1238 #ifdef PIPE_OS_WINDOWS
1239 /*
1240 * FIXME: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf()
1241 * which is inefficient; moreover, the CRT linkage does not work on
1242 * Windows, causing a segmentation fault. So simply disable the code for now.
1243 */
1244 return bld->one;
1245 #else
1246 const struct lp_type type = bld->type;
1247 LLVMTypeRef vec_type = lp_build_vec_type(type);
1248 char intrinsic[32];
1249
1250 /* TODO: optimize the constant case */
1251
1252 assert(type.floating);
1253 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
1254
1255 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1256 #endif
1257 }
1258
1259
1260 /**
1261 * Generate sin(a)
1262 */
1263 LLVMValueRef
1264 lp_build_sin(struct lp_build_context *bld,
1265 LLVMValueRef a)
1266 {
1267 #ifdef PIPE_OS_WINDOWS
1268 /*
1269 * FIXME: X86 backend translates llvm.sin.v4f32 to 4 calls to CRT's sinf()
1270 * which is inefficient; moreover, the CRT linkage does not work on
1271 * Windows, causing a segmentation fault. So simply disable the code for now.
1272 */
1273 return bld->zero;
1274 #else
1275 const struct lp_type type = bld->type;
1276 LLVMTypeRef vec_type = lp_build_vec_type(type);
1277 char intrinsic[32];
1278
1279 /* TODO: optimize the constant case */
1280
1281 assert(type.floating);
1282 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
1283
1284 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1285 #endif
1286 }
1287
1288
1289 /**
1290 * Generate pow(x, y)
1291 */
1292 LLVMValueRef
1293 lp_build_pow(struct lp_build_context *bld,
1294 LLVMValueRef x,
1295 LLVMValueRef y)
1296 {
1297 /* TODO: optimize the constant case */
1298 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1299 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1300 __FUNCTION__);
1301
1302 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1303 }
1304
1305
1306 /**
1307 * Generate exp(x)
1308 */
1309 LLVMValueRef
1310 lp_build_exp(struct lp_build_context *bld,
1311 LLVMValueRef x)
1312 {
1313 /* log2(e) = 1/log(2), since exp(x) = exp2(x * log2(e)) */
1314 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1315
1316 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1317 }
1318
1319
1320 /**
1321 * Generate log(x)
1322 */
1323 LLVMValueRef
1324 lp_build_log(struct lp_build_context *bld,
1325 LLVMValueRef x)
1326 {
1327 /* log(2), since log(x) = log2(x) * log(2) */
1328 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1329
1330 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1331 }
1332
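/*
 * Both helpers above reduce to base-2 operations through the
 * identities exp(x) = 2^(x * log2(e)) and log(x) = log2(x) * log(2).
 * Scalar sketch (hypothetical, plain C, assumes <math.h>):
 */
#if 0
static inline float my_expf(float x) { return exp2f(x * 1.442695040888963f); }
static inline float my_logf(float x) { return log2f(x) * 0.693147180559945f; }
#endif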
1333
1334 #define EXP_POLY_DEGREE 3
1335 #define LOG_POLY_DEGREE 5
1336
1337
1338 /**
1339 * Generate polynomial.
1340 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1341 */
1342 static LLVMValueRef
1343 lp_build_polynomial(struct lp_build_context *bld,
1344 LLVMValueRef x,
1345 const double *coeffs,
1346 unsigned num_coeffs)
1347 {
1348 const struct lp_type type = bld->type;
1349 LLVMTypeRef float_type = LLVMFloatType();
1350 LLVMValueRef res = NULL;
1351 unsigned i;
1352
1353 /* TODO: optimize the constant case */
1354 if(LLVMIsConstant(x))
1355 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1356 __FUNCTION__);
1357
1358 for (i = num_coeffs; i--; ) {
1359 LLVMValueRef coeff;
1360
1361 if (type.length == 1)
1362 coeff = LLVMConstReal(float_type, coeffs[i]);
1363 else
1364 coeff = lp_build_const_vec(type, coeffs[i]);
1365
1366 if(res)
1367 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1368 else
1369 res = coeff;
1370 }
1371
1372 if(res)
1373 return res;
1374 else
1375 return bld->undef;
1376 }
1377
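/*
 * The loop above evaluates the polynomial in Horner form, e.g. for
 * three coefficients c0 + x*(c1 + x*c2), using one multiply-add per
 * coefficient.  Scalar sketch (hypothetical helper, requires n >= 1):
 */
#if 0
static inline float
poly_eval(float x, const float *coeffs, unsigned n)
{
   float res = coeffs[n - 1];
   while (--n)
      res = coeffs[n - 1] + x * res;
   return res;
}
#endif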
1378
1379 /**
1380 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
1381 */
1382 const double lp_build_exp2_polynomial[] = {
1383 #if EXP_POLY_DEGREE == 5
1384 9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
1385 #elif EXP_POLY_DEGREE == 4
1386 1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
1387 #elif EXP_POLY_DEGREE == 3
1388 9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
1389 #elif EXP_POLY_DEGREE == 2
1390 1.0017247, 6.5763628e-1, 3.3718944e-1
1391 #else
1392 #error
1393 #endif
1394 };
1395
1396
1397 void
1398 lp_build_exp2_approx(struct lp_build_context *bld,
1399 LLVMValueRef x,
1400 LLVMValueRef *p_exp2_int_part,
1401 LLVMValueRef *p_frac_part,
1402 LLVMValueRef *p_exp2)
1403 {
1404 const struct lp_type type = bld->type;
1405 LLVMTypeRef vec_type = lp_build_vec_type(type);
1406 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1407 LLVMValueRef ipart = NULL;
1408 LLVMValueRef fpart = NULL;
1409 LLVMValueRef expipart = NULL;
1410 LLVMValueRef expfpart = NULL;
1411 LLVMValueRef res = NULL;
1412
1413 if(p_exp2_int_part || p_frac_part || p_exp2) {
1414 /* TODO: optimize the constant case */
1415 if(LLVMIsConstant(x))
1416 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1417 __FUNCTION__);
1418
1419 assert(type.floating && type.width == 32);
1420
1421 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1422 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1423
1424 /* ipart = int(x - 0.5) */
1425 ipart = LLVMBuildSub(bld->builder, x, lp_build_const_vec(type, 0.5f), "");
1426 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1427
1428 /* fpart = x - ipart */
1429 fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
1430 fpart = LLVMBuildSub(bld->builder, x, fpart, "");
1431 }
1432
1433 if(p_exp2_int_part || p_exp2) {
1434 /* expipart = (float) (1 << ipart) */
1435 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1436 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1437 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1438 }
1439
1440 if(p_exp2) {
1441 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1442 Elements(lp_build_exp2_polynomial));
1443
1444 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1445 }
1446
1447 if(p_exp2_int_part)
1448 *p_exp2_int_part = expipart;
1449
1450 if(p_frac_part)
1451 *p_frac_part = fpart;
1452
1453 if(p_exp2)
1454 *p_exp2 = res;
1455 }
1456
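/*
 * Conceptual scalar reduction of the approximation above (assumes
 * IEEE-754 single precision, <stdint.h> and <math.h>; hypothetical
 * helper; floorf is used for clarity where the vector code biases by
 * 0.5 before converting, and exp2_poly stands for the polynomial
 * evaluation above): split x into integer and fractional parts, build
 * 2^ipart by stuffing the biased exponent field, and evaluate the
 * minimax polynomial on the fractional part.
 */
#if 0
static inline float
exp2_approx(float x)
{
   union { float f; uint32_t i; } epart;
   int ipart = (int)floorf(x);
   float fpart = x - (float)ipart;            /* in [0, 1) */
   epart.i = (uint32_t)(ipart + 127) << 23;   /* 2^ipart via exponent bits */
   return epart.f * exp2_poly(fpart);         /* polynomial ~= 2^fpart */
}
#endif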
1457
1458 LLVMValueRef
1459 lp_build_exp2(struct lp_build_context *bld,
1460 LLVMValueRef x)
1461 {
1462 LLVMValueRef res;
1463 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1464 return res;
1465 }
1466
1467
1468 /**
1469 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1470 * These coefficients can be generated with
1471 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1472 */
1473 const double lp_build_log2_polynomial[] = {
1474 #if LOG_POLY_DEGREE == 6
1475 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
1476 #elif LOG_POLY_DEGREE == 5
1477 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
1478 #elif LOG_POLY_DEGREE == 4
1479 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
1480 #elif LOG_POLY_DEGREE == 3
1481 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
1482 #else
1483 #error
1484 #endif
1485 };
1486
1487
1488 /**
1489 * See http://www.devmaster.net/forums/showthread.php?p=43580
1490 */
1491 void
1492 lp_build_log2_approx(struct lp_build_context *bld,
1493 LLVMValueRef x,
1494 LLVMValueRef *p_exp,
1495 LLVMValueRef *p_floor_log2,
1496 LLVMValueRef *p_log2)
1497 {
1498 const struct lp_type type = bld->type;
1499 LLVMTypeRef vec_type = lp_build_vec_type(type);
1500 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1501
1502 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
1503 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
1504 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1505
1506 LLVMValueRef i = NULL;
1507 LLVMValueRef exp = NULL;
1508 LLVMValueRef mant = NULL;
1509 LLVMValueRef logexp = NULL;
1510 LLVMValueRef logmant = NULL;
1511 LLVMValueRef res = NULL;
1512
1513 if(p_exp || p_floor_log2 || p_log2) {
1514 /* TODO: optimize the constant case */
1515 if(LLVMIsConstant(x))
1516 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1517 __FUNCTION__);
1518
1519 assert(type.floating && type.width == 32);
1520
1521 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1522
1523 /* exp = (float) exponent(x) */
1524 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1525 }
1526
1527 if(p_floor_log2 || p_log2) {
1528 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
1529 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
1530 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1531 }
1532
1533 if(p_log2) {
1534 /* mant = (float) mantissa(x) */
1535 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1536 mant = LLVMBuildOr(bld->builder, mant, one, "");
1537 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1538
1539 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1540 Elements(lp_build_log2_polynomial));
1541
1542 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
1543 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1544
1545 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1546 }
1547
1548 if(p_exp) {
1549 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
1550 *p_exp = exp;
1551 }
1552
1553 if(p_floor_log2)
1554 *p_floor_log2 = logexp;
1555
1556 if(p_log2)
1557 *p_log2 = res;
1558 }
1559
1560
1561 /** scalar version of above function */
1562 static void
1563 lp_build_float_log2_approx(struct lp_build_context *bld,
1564 LLVMValueRef x,
1565 LLVMValueRef *p_exp,
1566 LLVMValueRef *p_floor_log2,
1567 LLVMValueRef *p_log2)
1568 {
1569 const struct lp_type type = bld->type;
1570 LLVMTypeRef float_type = LLVMFloatType();
1571 LLVMTypeRef int_type = LLVMIntType(type.width);
1572
1573 LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
1574 LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
1575 LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);
1576
1577 LLVMValueRef i = NULL;
1578 LLVMValueRef exp = NULL;
1579 LLVMValueRef mant = NULL;
1580 LLVMValueRef logexp = NULL;
1581 LLVMValueRef logmant = NULL;
1582 LLVMValueRef res = NULL;
1583
1584 if(p_exp || p_floor_log2 || p_log2) {
1585 /* TODO: optimize the constant case */
1586 if(LLVMIsConstant(x))
1587 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1588 __FUNCTION__);
1589
1590 assert(type.floating && type.width == 32);
1591
1592 i = LLVMBuildBitCast(bld->builder, x, int_type, "");
1593
1594 /* exp = (float) exponent(x) */
1595 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1596 }
1597
1598 if(p_floor_log2 || p_log2) {
1599 LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
1600 LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
1601 logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
1602 logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
1603 logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
1604 }
1605
1606 if(p_log2) {
1607 /* mant = (float) mantissa(x) */
1608 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1609 mant = LLVMBuildOr(bld->builder, mant, one, "");
1610 mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");
1611
1612 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1613 Elements(lp_build_log2_polynomial));
1614
1615 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
1616 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1617
1618 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1619 }
1620
1621 if(p_exp) {
1622 exp = LLVMBuildBitCast(bld->builder, exp, float_type, "");
1623 *p_exp = exp;
1624 }
1625
1626 if(p_floor_log2)
1627 *p_floor_log2 = logexp;
1628
1629 if(p_log2)
1630 *p_log2 = res;
1631 }
1632
1633
1634 LLVMValueRef
1635 lp_build_log2(struct lp_build_context *bld,
1636 LLVMValueRef x)
1637 {
1638 LLVMValueRef res;
1639 if (bld->type.length == 1) {
1640 lp_build_float_log2_approx(bld, x, NULL, NULL, &res);
1641 }
1642 else {
1643 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1644 }
1645 return res;
1646 }