src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper
  32  *
  33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
  34  * notably min/max and saturated operations), and it is often necessary to
  35  * resort to machine-specific intrinsics directly. The functions here hide all
  36  * these implementation details from the other modules.
  37  *
  38  * We also do simple expression simplification here. The reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include "util/u_memory.h"
  49 #include "util/u_debug.h"
  50 #include "util/u_math.h"
  51 #include "util/u_string.h"
  52 #include "util/u_cpu_detect.h"
  53
  54 #include "lp_bld_type.h"
  55 #include "lp_bld_const.h"
  56 #include "lp_bld_intr.h"
  57 #include "lp_bld_logic.h"
  58 #include "lp_bld_pack.h"
  59 #include "lp_bld_debug.h"
  60 #include "lp_bld_arit.h"
  61
  62
  63 /**
  64  * Generate min(a, b)
  65  * No checks for special case values of a or b = 1 or 0 are done.
  66  */
  67 static LLVMValueRef
  68 lp_build_min_simple(struct lp_build_context *bld,
  69                     LLVMValueRef a,
  70                     LLVMValueRef b)
  71 {
  72    const struct lp_type type = bld->type;
  73    const char *intrinsic = NULL;
  74    LLVMValueRef cond;
  75
  76    /* TODO: optimize the constant case */
  77
  78    if(type.width * type.length == 128) {
  79       if(type.floating) {
  80          if(type.width == 32 && util_cpu_caps.has_sse)
  81             intrinsic = "llvm.x86.sse.min.ps";
  82          if(type.width == 64 && util_cpu_caps.has_sse2)
  83             intrinsic = "llvm.x86.sse2.min.pd";
  84       }
  85       else {
  86          if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
  87             intrinsic = "llvm.x86.sse2.pminu.b";
  88          if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
  89             intrinsic = "llvm.x86.sse41.pminsb";
  90          if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
  91             intrinsic = "llvm.x86.sse41.pminuw";
  92          if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
  93             intrinsic = "llvm.x86.sse2.pmins.w";
  94          if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
  95             intrinsic = "llvm.x86.sse41.pminud";
  96          if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
  97             intrinsic = "llvm.x86.sse41.pminsd";
  98       }
  99    }
 100
 101    if(intrinsic)
 102       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
 103
 104    cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 105    return lp_build_select(bld, cond, a, b);
 106 }
 107
 108
 109 /**
 110  * Generate max(a, b)
 111  * No checks for special case values of a or b = 1 or 0 are done.
 112  */
 113 static LLVMValueRef
 114 lp_build_max_simple(struct lp_build_context *bld,
 115                     LLVMValueRef a,
 116                     LLVMValueRef b)
 117 {
 118    const struct lp_type type = bld->type;
 119    const char *intrinsic = NULL;
 120    LLVMValueRef cond;
 121
 122    /* TODO: optimize the constant case */
 123
 124    if(type.width * type.length == 128) {
 125       if(type.floating) {
 126          if(type.width == 32 && util_cpu_caps.has_sse)
 127             intrinsic = "llvm.x86.sse.max.ps";
 128          if(type.width == 64 && util_cpu_caps.has_sse2)
 129             intrinsic = "llvm.x86.sse2.max.pd";
 130       }
 131       else {
 132          if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
 133             intrinsic = "llvm.x86.sse2.pmaxu.b";
 134          if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
 135             intrinsic = "llvm.x86.sse41.pmaxsb";
 136          if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
 137             intrinsic = "llvm.x86.sse41.pmaxuw";
 138          if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
 139             intrinsic = "llvm.x86.sse2.pmaxs.w";
 140          if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
 141             intrinsic = "llvm.x86.sse41.pmaxud";
 142          if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
 143             intrinsic = "llvm.x86.sse41.pmaxsd";
 144       }
 145    }
 146
 147    if(intrinsic)
 148       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
 149
 150    cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 151    return lp_build_select(bld, cond, a, b);
 152 }
 153
 154
 155 /**
 156  * Generate 1 - a, or ~a depending on bld->type.
 157  */
 158 LLVMValueRef
 159 lp_build_comp(struct lp_build_context *bld,
 160               LLVMValueRef a)
 161 {
 162    const struct lp_type type = bld->type;
 163
 164    if(a == bld->one)
 165       return bld->zero;
 166    if(a == bld->zero)
 167       return bld->one;
 168
 169    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 170       if(LLVMIsConstant(a))
 171          return LLVMConstNot(a);
 172       else
 173          return LLVMBuildNot(bld->builder, a, "");
 174    }
 175
 176    if(LLVMIsConstant(a))
 177       return LLVMConstSub(bld->one, a);
 178    else
 179       return LLVMBuildSub(bld->builder, bld->one, a, "");
 180 }
 181
 182
 183 /**
 184  * Generate a + b
 185  */
 186 LLVMValueRef
 187 lp_build_add(struct lp_build_context *bld,
 188              LLVMValueRef a,
 189              LLVMValueRef b)
 190 {
 191    const struct lp_type type = bld->type;
 192    LLVMValueRef res;
 193
 194    if(a == bld->zero)
 195       return b;
 196    if(b == bld->zero)
 197       return a;
 198    if(a == bld->undef || b == bld->undef)
 199       return bld->undef;
 200
 201    if(bld->type.norm) {
 202       const char *intrinsic = NULL;
 203
 204       if(a == bld->one || b == bld->one)
 205         return bld->one;
 206
 207       if(util_cpu_caps.has_sse2 &&
 208          type.width * type.length == 128 &&
 209          !type.floating && !type.fixed) {
 210          if(type.width == 8)
 211             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
 212          if(type.width == 16)
 213             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
 214       }
 215
 216       if(intrinsic)
 217          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
 218    }
 219
 220    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 221       res = LLVMConstAdd(a, b);
 222    else
 223       res = LLVMBuildAdd(bld->builder, a, b, "");
 224
 225    /* clamp to ceiling of 1.0 */
 226    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 227       res = lp_build_min_simple(bld, res, bld->one);
 228
 229    /* XXX clamp to floor of -1 or 0??? */
 230
 231    return res;
 232 }
 233
 234
 235 /** Return the sum of the elements of a */
 236 LLVMValueRef
 237 lp_build_sum_vector(struct lp_build_context *bld,
 238                     LLVMValueRef a)
 239 {
 240    const struct lp_type type = bld->type;
 241    LLVMValueRef index, res;
 242    int i;
 243
 244    if (a == bld->zero)
 245       return bld->zero;
 246    if (a == bld->undef)
 247       return bld->undef;
 248    assert(type.length > 1);
 249
 250    assert(!bld->type.norm);
 251
 252    index = LLVMConstInt(LLVMInt32Type(), 0, 0);
 253    res = LLVMBuildExtractElement(bld->builder, a, index, "");
 254
 255    for (i = 1; i < type.length; i++) {
 256       index = LLVMConstInt(LLVMInt32Type(), i, 0);
 257       res = LLVMBuildAdd(bld->builder, res,
 258                          LLVMBuildExtractElement(bld->builder, a, index, ""),
 259                          "");
 260    }
 261
 262    return res;
 263 }
 264
 265
 266 /**
 267  * Generate a - b
 268  */
 269 LLVMValueRef
 270 lp_build_sub(struct lp_build_context *bld,
 271              LLVMValueRef a,
 272              LLVMValueRef b)
 273 {
 274    const struct lp_type type = bld->type;
 275    LLVMValueRef res;
 276
 277    if(b == bld->zero)
 278       return a;
 279    if(a == bld->undef || b == bld->undef)
 280       return bld->undef;
 281    if(a == b)
 282       return bld->zero;
 283
 284    if(bld->type.norm) {
 285       const char *intrinsic = NULL;
 286
 287       if(b == bld->one)
 288         return bld->zero;
 289
 290       if(util_cpu_caps.has_sse2 &&
 291          type.width * type.length == 128 &&
 292          !type.floating && !type.fixed) {
 293          if(type.width == 8)
 294             intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
 295          if(type.width == 16)
 296             intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
 297       }
 298
 299       if(intrinsic)
 300          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
 301    }
 302
 303    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 304       res = LLVMConstSub(a, b);
 305    else
 306       res = LLVMBuildSub(bld->builder, a, b, "");
 307
 308    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 309       res = lp_build_max_simple(bld, res, bld->zero);
 310
 311    return res;
 312 }
 313
 314
 315 /**
 316  * Normalized 8bit multiplication.
 317  *
 318  * - alpha plus one
 319  *
 320  *     makes the following approximation to the division (Sree)
 321  *
 322  *       a*b/255 ~= (a*(b + 1)) >> 256
 323  *
 324  *     which is the fastest method that satisfies the following OpenGL criteria
 325  *
 326  *       0*0 = 0 and 255*255 = 255
 327  *
 328  * - geometric series
 329  *
 330  *     takes the geometric series approximation to the division
 331  *
 332  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 333  *
 334  *     in this case just the first two terms to fit in 16bit arithmetic
 335  *
 336  *       t/255 ~= (t + (t >> 8)) >> 8
 337  *
 338  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 339  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 340  *     must be used
 341  *
 342  * - geometric series plus rounding
 343  *
 344  *     when using a geometric series division instead of truncating the result
 345  *     use roundoff in the approximation (Jim Blinn)
 346  *
 347  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 348  *
 349  *     achieving the exact results
 350  *
 351  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 352  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 353  * @sa Michael Herf, The "double blend trick", May 2000,
 354  *     http://www.stereopsis.com/doubleblend.html
 355  */
 356 static LLVMValueRef
 357 lp_build_mul_u8n(LLVMBuilderRef builder,
 358                  struct lp_type i16_type,
 359                  LLVMValueRef a, LLVMValueRef b)
 360 {
 361    LLVMValueRef c8;
 362    LLVMValueRef ab;
 363
 364    c8 = lp_build_int_const_scalar(i16_type, 8);
 365
 366 #if 0
 367
 368    /* a*b/255 ~= (a*(b + 1)) >> 256 */
 369    b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
 370    ab = LLVMBuildMul(builder, a, b, "");
 371
 372 #else
 373
 374    /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
 375    ab = LLVMBuildMul(builder, a, b, "");
 376    ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
 377    ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
 378
 379 #endif
 380
 381    ab = LLVMBuildLShr(builder, ab, c8, "");
 382
 383    return ab;
 384 }
 385
 386
 387 /**
 388  * Generate a * b
 389  */
 390 LLVMValueRef
 391 lp_build_mul(struct lp_build_context *bld,
 392              LLVMValueRef a,
 393              LLVMValueRef b)
 394 {
 395    const struct lp_type type = bld->type;
 396    LLVMValueRef shift;
 397    LLVMValueRef res;
 398
 399    if(a == bld->zero)
 400       return bld->zero;
 401    if(a == bld->one)
 402       return b;
 403    if(b == bld->zero)
 404       return bld->zero;
 405    if(b == bld->one)
 406       return a;
 407    if(a == bld->undef || b == bld->undef)
 408       return bld->undef;
 409
 410    if(!type.floating && !type.fixed && type.norm) {
 411       if(type.width == 8) {
 412          struct lp_type i16_type = lp_wider_type(type);
 413          LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 414
 415          lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
 416          lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
 417
 418          /* PMULLW, PSRLW, PADDW */
 419          abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
 420          abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
 421
 422          ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
 423
 424          return ab;
 425       }
 426
 427       /* FIXME */
 428       assert(0);
 429    }
 430
 431    if(type.fixed)
 432       shift = lp_build_int_const_scalar(type, type.width/2);
 433    else
 434       shift = NULL;
 435
 436    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 437       res =  LLVMConstMul(a, b);
 438       if(shift) {
 439          if(type.sign)
 440             res = LLVMConstAShr(res, shift);
 441          else
 442             res = LLVMConstLShr(res, shift);
 443       }
 444    }
 445    else {
 446       res = LLVMBuildMul(bld->builder, a, b, "");
 447       if(shift) {
 448          if(type.sign)
 449             res = LLVMBuildAShr(bld->builder, res, shift, "");
 450          else
 451             res = LLVMBuildLShr(bld->builder, res, shift, "");
 452       }
 453    }
 454
 455    return res;
 456 }
 457
 458
 459 /**
 460  * Small vector x scale multiplication optimization.
 461  */
 462 LLVMValueRef
 463 lp_build_mul_imm(struct lp_build_context *bld,
 464                  LLVMValueRef a,
 465                  int b)
 466 {
 467    LLVMValueRef factor;
 468
 469    if(b == 0)
 470       return bld->zero;
 471
 472    if(b == 1)
 473       return a;
 474
 475    if(b == -1)
 476       return LLVMBuildNeg(bld->builder, a, "");
 477
 478    if(b == 2 && bld->type.floating)
 479       return lp_build_add(bld, a, a);
 480
 481    if(util_is_pot(b)) {
 482       unsigned shift = ffs(b) - 1;
 483
 484       if(bld->type.floating) {
 485 #if 0
 486          /*
 487           * Power of two multiplication by directly manipulating the mantissa.
 488           *
 489           * XXX: This might not be always faster, it will introduce a small error
 490           * for multiplication by zero, and it will produce wrong results
 491           * for Inf and NaN.
 492           */
 493          unsigned mantissa = lp_mantissa(bld->type);
 494          factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
 495          a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
 496          a = LLVMBuildAdd(bld->builder, a, factor, "");
 497          a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
 498          return a;
 499 #endif
 500       }
 501       else {
 502          factor = lp_build_const_scalar(bld->type, shift);
 503          return LLVMBuildShl(bld->builder, a, factor, "");
 504       }
 505    }
 506
 507    factor = lp_build_const_scalar(bld->type, (double)b);
 508    return lp_build_mul(bld, a, factor);
 509 }
 510
 511
 512 /**
 513  * Generate a / b
 514  */
 515 LLVMValueRef
 516 lp_build_div(struct lp_build_context *bld,
 517              LLVMValueRef a,
 518              LLVMValueRef b)
 519 {
 520    const struct lp_type type = bld->type;
 521
 522    if(a == bld->zero)
 523       return bld->zero;
 524    if(a == bld->one)
 525       return lp_build_rcp(bld, b);
 526    if(b == bld->zero)
 527       return bld->undef;
 528    if(b == bld->one)
 529       return a;
 530    if(a == bld->undef || b == bld->undef)
 531       return bld->undef;
 532
 533    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 534       return LLVMConstFDiv(a, b);
 535
 536    if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
 537       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
 538
 539    return LLVMBuildFDiv(bld->builder, a, b, "");
 540 }
 541
 542
 543 /**
 544  * Linear interpolation.
 545  *
 546  * This also works for integer values with a few caveats.
 547  *
 548  * @sa http://www.stereopsis.com/doubleblend.html
 549  */
 550 LLVMValueRef
 551 lp_build_lerp(struct lp_build_context *bld,
 552               LLVMValueRef x,
 553               LLVMValueRef v0,
 554               LLVMValueRef v1)
 555 {
 556    LLVMValueRef delta;
 557    LLVMValueRef res;
 558
 559    delta = lp_build_sub(bld, v1, v0);
 560
 561    res = lp_build_mul(bld, x, delta);
 562
 563    res = lp_build_add(bld, v0, res);
 564
 565    if(bld->type.fixed)
 566       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
 567        * but it will be wrong for other uses. Basically we need a more
 568        * powerful lp_type, capable of further distinguishing the values
 569        * interpretation from the value storage. */
 570       res = LLVMBuildAnd(bld->builder, res, lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1), "");
 571
 572    return res;
 573 }
 574
 575
 576 LLVMValueRef
 577 lp_build_lerp_2d(struct lp_build_context *bld,
 578                  LLVMValueRef x,
 579                  LLVMValueRef y,
 580                  LLVMValueRef v00,
 581                  LLVMValueRef v01,
 582                  LLVMValueRef v10,
 583                  LLVMValueRef v11)
 584 {
 585    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
 586    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
 587    return lp_build_lerp(bld, y, v0, v1);
 588 }
 589
 590
 591 /**
 592  * Generate min(a, b)
 593  * Do checks for special cases.
 594  */
 595 LLVMValueRef
 596 lp_build_min(struct lp_build_context *bld,
 597              LLVMValueRef a,
 598              LLVMValueRef b)
 599 {
 600    if(a == bld->undef || b == bld->undef)
 601       return bld->undef;
 602
 603    if(a == b)
 604       return a;
 605
 606    if(bld->type.norm) {
 607       if(a == bld->zero || b == bld->zero)
 608          return bld->zero;
 609       if(a == bld->one)
 610          return b;
 611       if(b == bld->one)
 612          return a;
 613    }
 614
 615    return lp_build_min_simple(bld, a, b);
 616 }
 617
 618
 619 /**
 620  * Generate max(a, b)
 621  * Do checks for special cases.
 622  */
 623 LLVMValueRef
 624 lp_build_max(struct lp_build_context *bld,
 625              LLVMValueRef a,
 626              LLVMValueRef b)
 627 {
 628    if(a == bld->undef || b == bld->undef)
 629       return bld->undef;
 630
 631    if(a == b)
 632       return a;
 633
 634    if(bld->type.norm) {
 635       if(a == bld->one || b == bld->one)
 636          return bld->one;
 637       if(a == bld->zero)
 638          return b;
 639       if(b == bld->zero)
 640          return a;
 641    }
 642
 643    return lp_build_max_simple(bld, a, b);
 644 }
 645
 646
 647 /**
 648  * Generate clamp(a, min, max)
 649  * Do checks for special cases.
 650  */
 651 LLVMValueRef
 652 lp_build_clamp(struct lp_build_context *bld,
 653                LLVMValueRef a,
 654                LLVMValueRef min,
 655                LLVMValueRef max)
 656 {
 657    a = lp_build_min(bld, a, max);
 658    a = lp_build_max(bld, a, min);
 659    return a;
 660 }
 661
 662
 663 /**
 664  * Generate abs(a)
 665  */
 666 LLVMValueRef
 667 lp_build_abs(struct lp_build_context *bld,
 668              LLVMValueRef a)
 669 {
 670    const struct lp_type type = bld->type;
 671    LLVMTypeRef vec_type = lp_build_vec_type(type);
 672
 673    if(!type.sign)
 674       return a;
 675
 676    if(type.floating) {
 677       /* Mask out the sign bit */
 678       if (type.length == 1) {
 679          LLVMTypeRef int_type = LLVMIntType(type.width);
 680          LLVMTypeRef float_type = LLVMFloatType();
 681          unsigned long long absMask = ~(1ULL << (type.width - 1));
 682          LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0);
 683          a = LLVMBuildBitCast(bld->builder, a, int_type, "");
 684          a = LLVMBuildAnd(bld->builder, a, mask, "");
 685          a = LLVMBuildBitCast(bld->builder, a, float_type, "");
 686          return a;
 687       }
 688       else {
 689          /* vector of floats */
 690          LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 691          unsigned long long absMask = ~(1ULL << (type.width - 1));
 692          LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long) absMask));
 693          a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
 694          a = LLVMBuildAnd(bld->builder, a, mask, "");
 695          a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
 696          return a;
 697       }
 698    }
 699
 700    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
 701       switch(type.width) {
 702       case 8:
 703          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
 704       case 16:
 705          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
 706       case 32:
 707          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
 708       }
 709    }
 710
 711    return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
 712 }
 713
 714
 715 LLVMValueRef
 716 lp_build_negate(struct lp_build_context *bld,
 717                 LLVMValueRef a)
 718 {
 719    return LLVMBuildNeg(bld->builder, a, "");
 720 }
 721
 722
 723 /** Return -1, 0 or +1 depending on the sign of a */
 724 LLVMValueRef
 725 lp_build_sgn(struct lp_build_context *bld,
 726              LLVMValueRef a)
 727 {
 728    const struct lp_type type = bld->type;
 729    LLVMValueRef cond;
 730    LLVMValueRef res;
 731
 732    /* Handle non-zero case */
 733    if(!type.sign) {
 734       /* if not zero then sign must be positive */
 735       res = bld->one;
 736    }
 737    else if(type.floating) {
 738       LLVMTypeRef vec_type;
 739       LLVMTypeRef int_type;
 740       LLVMValueRef mask;
 741       LLVMValueRef sign;
 742       LLVMValueRef one;
 743       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
 744
 745       if (type.length == 1) {
 746          int_type = lp_build_int_elem_type(type);
 747          vec_type = lp_build_elem_type(type);
 748          mask = LLVMConstInt(int_type, maskBit, 0);
 749       }
 750       else {
 751          /* vector */
 752          int_type = lp_build_int_vec_type(type);
 753          vec_type = lp_build_vec_type(type);
 754          mask = lp_build_int_const_scalar(type, maskBit);
 755       }
 756
 757       /* Take the sign bit and add it to 1 constant */
 758       sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
 759       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
 760       one = LLVMConstBitCast(bld->one, int_type);
 761       res = LLVMBuildOr(bld->builder, sign, one, "");
 762       res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
 763    }
 764    else
 765    {
 766       LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
 767       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
 768       res = lp_build_select(bld, cond, bld->one, minus_one);
 769    }
 770
 771    /* Handle zero */
 772    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
 773    res = lp_build_select(bld, cond, bld->zero, bld->one);
 774
 775    return res;
 776 }
 777
 778
 779 /**
 780  * Set the sign of float vector 'a' according to 'sign'.
 781  * If sign==0, return abs(a).
 782  * If sign==1, return -abs(a);
 783  * Other values for sign produce undefined results.
 784  */
 785 LLVMValueRef
 786 lp_build_set_sign(struct lp_build_context *bld,
 787                   LLVMValueRef a, LLVMValueRef sign)
 788 {
 789    const struct lp_type type = bld->type;
 790    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 791    LLVMTypeRef vec_type = lp_build_vec_type(type);
 792    LLVMValueRef shift = lp_build_int_const_scalar(type, type.width - 1);
 793    LLVMValueRef mask = lp_build_int_const_scalar(type,
 794                              ~((unsigned long long) 1 << (type.width - 1)));
 795    LLVMValueRef val, res;
 796
 797    assert(type.floating);
 798
 799    /* val = reinterpret_cast<int>(a) */
 800    val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
 801    /* val = val & mask */
 802    val = LLVMBuildAnd(bld->builder, val, mask, "");
 803    /* sign = sign << shift */
 804    sign = LLVMBuildShl(bld->builder, sign, shift, "");
 805    /* res = val | sign */
 806    res = LLVMBuildOr(bld->builder, val, sign, "");
 807    /* res = reinterpret_cast<float>(res) */
 808    res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
 809
 810    return res;
 811 }
 812
 813
 814 /**
 815  * Convert vector of (or scalar) int to vector of (or scalar) float.
 816  */
 817 LLVMValueRef
 818 lp_build_int_to_float(struct lp_build_context *bld,
 819                       LLVMValueRef a)
 820 {
 821    const struct lp_type type = bld->type;
 822
 823    assert(type.floating);
 824    /*assert(lp_check_value(type, a));*/
 825
 826    if (type.length == 1) {
 827       LLVMTypeRef float_type = LLVMFloatType();
 828       return LLVMBuildSIToFP(bld->builder, a, float_type, "");
 829    }
 830    else {
 831       LLVMTypeRef vec_type = lp_build_vec_type(type);
 832       /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
 833       LLVMValueRef res;
 834       res = LLVMBuildSIToFP(bld->builder, a, vec_type, "");
 835       return res;
 836    }
 837 }
 838
 839
 840
 841 enum lp_build_round_sse41_mode
 842 {
 843    LP_BUILD_ROUND_SSE41_NEAREST = 0,
 844    LP_BUILD_ROUND_SSE41_FLOOR = 1,
 845    LP_BUILD_ROUND_SSE41_CEIL = 2,
 846    LP_BUILD_ROUND_SSE41_TRUNCATE = 3
 847 };
 848
 849
 850 static INLINE LLVMValueRef
 851 lp_build_round_sse41(struct lp_build_context *bld,
 852                      LLVMValueRef a,
 853                      enum lp_build_round_sse41_mode mode)
 854 {
 855    const struct lp_type type = bld->type;
 856    LLVMTypeRef vec_type = lp_build_vec_type(type);
 857    const char *intrinsic;
 858
 859    assert(type.floating);
 860    assert(type.width*type.length == 128);
 861    assert(lp_check_value(type, a));
 862    assert(util_cpu_caps.has_sse4_1);
 863
 864    switch(type.width) {
 865    case 32:
 866       intrinsic = "llvm.x86.sse41.round.ps";
 867       break;
 868    case 64:
 869       intrinsic = "llvm.x86.sse41.round.pd";
 870       break;
 871    default:
 872       assert(0);
 873       return bld->undef;
 874    }
 875
 876    return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
 877                                     LLVMConstInt(LLVMInt32Type(), mode, 0));
 878 }
 879
 880
 881 LLVMValueRef
 882 lp_build_trunc(struct lp_build_context *bld,
 883                LLVMValueRef a)
 884 {
 885    const struct lp_type type = bld->type;
 886
 887    assert(type.floating);
 888    assert(lp_check_value(type, a));
 889
 890    if(util_cpu_caps.has_sse4_1)
 891       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
 892    else {
 893       LLVMTypeRef vec_type = lp_build_vec_type(type);
 894       LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 895       LLVMValueRef res;
 896       res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
 897       res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
 898       return res;
 899    }
 900 }
 901
 902
 903 LLVMValueRef
 904 lp_build_round(struct lp_build_context *bld,
 905                LLVMValueRef a)
 906 {
 907    const struct lp_type type = bld->type;
 908
 909    assert(type.floating);
 910    assert(lp_check_value(type, a));
 911
 912    if(util_cpu_caps.has_sse4_1)
 913       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
 914    else {
 915       LLVMTypeRef vec_type = lp_build_vec_type(type);
 916       LLVMValueRef res;
 917       res = lp_build_iround(bld, a);
 918       res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
 919       return res;
 920    }
 921 }
 922
 923
 924 LLVMValueRef
 925 lp_build_floor(struct lp_build_context *bld,
 926                LLVMValueRef a)
 927 {
 928    const struct lp_type type = bld->type;
 929
 930    assert(type.floating);
 931
 932    if (type.length == 1) {
 933       return LLVMBuildFPTrunc(bld->builder, a, LLVMFloatType(), "");
 934    }
 935
 936    if(util_cpu_caps.has_sse4_1)
 937       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
 938    else {
 939       LLVMTypeRef vec_type = lp_build_vec_type(type);
 940       LLVMValueRef res;
 941       res = lp_build_ifloor(bld, a);
 942       res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
 943       return res;
 944    }
 945 }
 946
 947
 948 LLVMValueRef
 949 lp_build_ceil(struct lp_build_context *bld,
 950               LLVMValueRef a)
 951 {
 952    const struct lp_type type = bld->type;
 953
 954    assert(type.floating);
 955    assert(lp_check_value(type, a));
 956
 957    if(util_cpu_caps.has_sse4_1)
 958       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
 959    else {
 960       LLVMTypeRef vec_type = lp_build_vec_type(type);
 961       LLVMValueRef res;
 962       res = lp_build_iceil(bld, a);
 963       res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
 964       return res;
 965    }
 966 }
 967
 968
 969 /**
 970  * Return fractional part of 'a' computed as a - floor(f)
 971  * Typically used in texture coord arithmetic.
 972  */
 973 LLVMValueRef
 974 lp_build_fract(struct lp_build_context *bld,
 975                LLVMValueRef a)
 976 {
 977    assert(bld->type.floating);
 978    return lp_build_sub(bld, a, lp_build_floor(bld, a));
 979 }
 980
 981
 982 /**
 983  * Convert to integer, through whichever rounding method that's fastest,
 984  * typically truncating toward zero.
 985  */
 986 LLVMValueRef
 987 lp_build_itrunc(struct lp_build_context *bld,
 988                 LLVMValueRef a)
 989 {
 990    const struct lp_type type = bld->type;
 991
 992    assert(type.floating);
 993
 994    if (type.length == 1) {
 995       LLVMTypeRef int_type = LLVMIntType(type.width);
 996       return LLVMBuildFPTrunc(bld->builder, a, int_type, "");
 997    }
 998    else {
 999       LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1000       assert(lp_check_value(type, a));
1001       return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1002    }
1003 }
1004
1005
1006 /**
1007  * Convert float[] to int[] with round().
1008  */
1009 LLVMValueRef
1010 lp_build_iround(struct lp_build_context *bld,
1011                 LLVMValueRef a)
1012 {
1013    const struct lp_type type = bld->type;
1014    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1015    LLVMValueRef res;
1016
1017    assert(type.floating);
1018
1019    if (type.length == 1) {
1020       /* scalar float to int */
1021       LLVMTypeRef int_type = LLVMIntType(type.width);
1022       /* XXX we want rounding here! */
1023       res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
1024       return res;
1025    }
1026
1027    assert(lp_check_value(type, a));
1028
1029    if(util_cpu_caps.has_sse4_1) {
1030       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1031    }
1032    else {
1033       LLVMTypeRef vec_type = lp_build_vec_type(type);
1034       LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
1035       LLVMValueRef sign;
1036       LLVMValueRef half;
1037
1038       /* get sign bit */
1039       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1040       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1041
1042       /* sign * 0.5 */
1043       half = lp_build_const_scalar(type, 0.5);
1044       half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1045       half = LLVMBuildOr(bld->builder, sign, half, "");
1046       half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1047
1048       res = LLVMBuildAdd(bld->builder, a, half, "");
1049    }
1050
1051    res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1052
1053    return res;
1054 }
1055
1056
1057 /**
1058  * Convert float[] to int[] with floor().
1059  */
1060 LLVMValueRef
1061 lp_build_ifloor(struct lp_build_context *bld,
1062                 LLVMValueRef a)
1063 {
1064    const struct lp_type type = bld->type;
1065    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1066    LLVMValueRef res;
1067
1068    assert(type.floating);
1069
1070    if (type.length == 1) {
1071       /* scalar float to int */
1072       LLVMTypeRef int_type = LLVMIntType(type.width);
1073       res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
1074       return res;
1075    }
1076
1077    assert(lp_check_value(type, a));
1078
1079    if(util_cpu_caps.has_sse4_1) {
1080       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1081    }
1082    else {
1083       /* Take the sign bit and add it to 1 constant */
1084       LLVMTypeRef vec_type = lp_build_vec_type(type);
1085       unsigned mantissa = lp_mantissa(type);
1086       LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
1087       LLVMValueRef sign;
1088       LLVMValueRef offset;
1089
1090       /* sign = a < 0 ? ~0 : 0 */
1091       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1092       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1093       sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
1094       lp_build_name(sign, "floor.sign");
1095
1096       /* offset = -0.99999(9)f */
1097       offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
1098       offset = LLVMConstBitCast(offset, int_vec_type);
1099
1100       /* offset = a < 0 ? -0.99999(9)f : 0.0f */
1101       offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1102       offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
1103       lp_build_name(offset, "floor.offset");
1104
1105       res = LLVMBuildAdd(bld->builder, a, offset, "");
1106       lp_build_name(res, "floor.res");
1107    }
1108
1109    res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1110    lp_build_name(res, "floor");
1111
1112    return res;
1113 }
1114
1115
1116 LLVMValueRef
1117 lp_build_iceil(struct lp_build_context *bld,
1118                LLVMValueRef a)
1119 {
1120    const struct lp_type type = bld->type;
1121    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1122    LLVMValueRef res;
1123
1124    assert(type.floating);
1125    assert(lp_check_value(type, a));
1126
1127    if(util_cpu_caps.has_sse4_1) {
1128       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1129    }
1130    else {
1131       assert(0);
1132       res = bld->undef;
1133    }
1134
1135    res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1136
1137    return res;
1138 }
1139
1140
1141 LLVMValueRef
1142 lp_build_sqrt(struct lp_build_context *bld,
1143               LLVMValueRef a)
1144 {
1145    const struct lp_type type = bld->type;
1146    LLVMTypeRef vec_type = lp_build_vec_type(type);
1147    char intrinsic[32];
1148
1149    /* TODO: optimize the constant case */
1150    /* TODO: optimize the constant case */
1151
1152    assert(type.floating);
1153    util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1154
1155    return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1156 }
1157
1158
1159 LLVMValueRef
1160 lp_build_rcp(struct lp_build_context *bld,
1161              LLVMValueRef a)
1162 {
1163    const struct lp_type type = bld->type;
1164
1165    if(a == bld->zero)
1166       return bld->undef;
1167    if(a == bld->one)
1168       return bld->one;
1169    if(a == bld->undef)
1170       return bld->undef;
1171
1172    assert(type.floating);
1173
1174    if(LLVMIsConstant(a))
1175       return LLVMConstFDiv(bld->one, a);
1176
1177    if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1178       /* FIXME: improve precision */
1179       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1180
1181    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1182 }
1183
1184
1185 /**
1186  * Generate 1/sqrt(a)
1187  */
1188 LLVMValueRef
1189 lp_build_rsqrt(struct lp_build_context *bld,
1190                LLVMValueRef a)
1191 {
1192    const struct lp_type type = bld->type;
1193
1194    assert(type.floating);
1195
1196    if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1197       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1198
1199    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1200 }
1201
1202
1203 /**
1204  * Generate cos(a)
1205  */
1206 LLVMValueRef
1207 lp_build_cos(struct lp_build_context *bld,
1208               LLVMValueRef a)
1209 {
1210    const struct lp_type type = bld->type;
1211    LLVMTypeRef vec_type = lp_build_vec_type(type);
1212    char intrinsic[32];
1213
1214    /* TODO: optimize the constant case */
1215
1216    assert(type.floating);
1217    util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
1218
1219    return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1220 }
1221
1222
1223 /**
1224  * Generate sin(a)
1225  */
1226 LLVMValueRef
1227 lp_build_sin(struct lp_build_context *bld,
1228               LLVMValueRef a)
1229 {
1230    const struct lp_type type = bld->type;
1231    LLVMTypeRef vec_type = lp_build_vec_type(type);
1232    char intrinsic[32];
1233
1234    /* TODO: optimize the constant case */
1235
1236    assert(type.floating);
1237    util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
1238
1239    return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1240 }
1241
1242
1243 /**
1244  * Generate pow(x, y)
1245  */
1246 LLVMValueRef
1247 lp_build_pow(struct lp_build_context *bld,
1248              LLVMValueRef x,
1249              LLVMValueRef y)
1250 {
1251    /* TODO: optimize the constant case */
1252    if(LLVMIsConstant(x) && LLVMIsConstant(y))
1253       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1254                    __FUNCTION__);
1255
1256    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1257 }
1258
1259
1260 /**
1261  * Generate exp(x)
1262  */
1263 LLVMValueRef
1264 lp_build_exp(struct lp_build_context *bld,
1265              LLVMValueRef x)
1266 {
1267    /* log2(e) = 1/log(2) */
1268    LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
1269
1270    return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1271 }
1272
1273
1274 /**
1275  * Generate log(x)
1276  */
1277 LLVMValueRef
1278 lp_build_log(struct lp_build_context *bld,
1279              LLVMValueRef x)
1280 {
1281    /* log(2) */
1282    LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
1283
1284    return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1285 }
1286
1287
/*
 * Degree of the polynomial selected for lp_build_exp2_polynomial[];
 * coefficient sets exist for degrees 2 to 5.
 */
#define EXP_POLY_DEGREE 3
/*
 * Selector for lp_build_log2_polynomial[]; coefficient sets exist for the
 * values 3 to 6.
 */
#define LOG_POLY_DEGREE 5
1290
1291
1292 /**
1293  * Generate polynomial.
1294  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1295  */
1296 static LLVMValueRef
1297 lp_build_polynomial(struct lp_build_context *bld,
1298                     LLVMValueRef x,
1299                     const double *coeffs,
1300                     unsigned num_coeffs)
1301 {
1302    const struct lp_type type = bld->type;
1303    LLVMTypeRef float_type = LLVMFloatType();
1304    LLVMValueRef res = NULL;
1305    unsigned i;
1306
1307    /* TODO: optimize the constant case */
1308    if(LLVMIsConstant(x))
1309       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1310                    __FUNCTION__);
1311
1312    for (i = num_coeffs; i--; ) {
1313       LLVMValueRef coeff;
1314
1315       if (type.length == 1)
1316          coeff = LLVMConstReal(float_type, coeffs[i]);
1317       else
1318          coeff = lp_build_const_scalar(type, coeffs[i]);
1319
1320       if(res)
1321          res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1322       else
1323          res = coeff;
1324    }
1325
1326    if(res)
1327       return res;
1328    else
1329       return bld->undef;
1330 }
1331
1332
1333 /**
1334  * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
1335  */
1336 const double lp_build_exp2_polynomial[] = {
1337 #if EXP_POLY_DEGREE == 5
1338    9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
1339 #elif EXP_POLY_DEGREE == 4
1340    1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
1341 #elif EXP_POLY_DEGREE == 3
1342    9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
1343 #elif EXP_POLY_DEGREE == 2
1344    1.0017247, 6.5763628e-1, 3.3718944e-1
1345 #else
1346 #error
1347 #endif
1348 };
1349
1350
1351 void
1352 lp_build_exp2_approx(struct lp_build_context *bld,
1353                      LLVMValueRef x,
1354                      LLVMValueRef *p_exp2_int_part,
1355                      LLVMValueRef *p_frac_part,
1356                      LLVMValueRef *p_exp2)
1357 {
1358    const struct lp_type type = bld->type;
1359    LLVMTypeRef vec_type = lp_build_vec_type(type);
1360    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1361    LLVMValueRef ipart = NULL;
1362    LLVMValueRef fpart = NULL;
1363    LLVMValueRef expipart = NULL;
1364    LLVMValueRef expfpart = NULL;
1365    LLVMValueRef res = NULL;
1366
1367    if(p_exp2_int_part || p_frac_part || p_exp2) {
1368       /* TODO: optimize the constant case */
1369       if(LLVMIsConstant(x))
1370          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1371                       __FUNCTION__);
1372
1373       assert(type.floating && type.width == 32);
1374
1375       x = lp_build_min(bld, x, lp_build_const_scalar(type,  129.0));
1376       x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
1377
1378       /* ipart = int(x - 0.5) */
1379       ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
1380       ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1381
1382       /* fpart = x - ipart */
1383       fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
1384       fpart = LLVMBuildSub(bld->builder, x, fpart, "");
1385    }
1386
1387    if(p_exp2_int_part || p_exp2) {
1388       /* expipart = (float) (1 << ipart) */
1389       expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
1390       expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
1391       expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1392    }
1393
1394    if(p_exp2) {
1395       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1396                                      Elements(lp_build_exp2_polynomial));
1397
1398       res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1399    }
1400
1401    if(p_exp2_int_part)
1402       *p_exp2_int_part = expipart;
1403
1404    if(p_frac_part)
1405       *p_frac_part = fpart;
1406
1407    if(p_exp2)
1408       *p_exp2 = res;
1409 }
1410
1411
1412 LLVMValueRef
1413 lp_build_exp2(struct lp_build_context *bld,
1414               LLVMValueRef x)
1415 {
1416    LLVMValueRef res;
1417    lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1418    return res;
1419 }
1420
1421
1422 /**
1423  * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1424  * These coefficients can be generate with
1425  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1426  */
1427 const double lp_build_log2_polynomial[] = {
1428 #if LOG_POLY_DEGREE == 6
1429    3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
1430 #elif LOG_POLY_DEGREE == 5
1431    2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
1432 #elif LOG_POLY_DEGREE == 4
1433    2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
1434 #elif LOG_POLY_DEGREE == 3
1435    2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
1436 #else
1437 #error
1438 #endif
1439 };
1440
1441
1442 /**
1443  * See http://www.devmaster.net/forums/showthread.php?p=43580
1444  */
1445 void
1446 lp_build_log2_approx(struct lp_build_context *bld,
1447                      LLVMValueRef x,
1448                      LLVMValueRef *p_exp,
1449                      LLVMValueRef *p_floor_log2,
1450                      LLVMValueRef *p_log2)
1451 {
1452    const struct lp_type type = bld->type;
1453    LLVMTypeRef vec_type = lp_build_vec_type(type);
1454    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1455
1456    LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
1457    LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
1458    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1459
1460    LLVMValueRef i = NULL;
1461    LLVMValueRef exp = NULL;
1462    LLVMValueRef mant = NULL;
1463    LLVMValueRef logexp = NULL;
1464    LLVMValueRef logmant = NULL;
1465    LLVMValueRef res = NULL;
1466
1467    if(p_exp || p_floor_log2 || p_log2) {
1468       /* TODO: optimize the constant case */
1469       if(LLVMIsConstant(x))
1470          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1471                       __FUNCTION__);
1472
1473       assert(type.floating && type.width == 32);
1474
1475       i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1476
1477       /* exp = (float) exponent(x) */
1478       exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1479    }
1480
1481    if(p_floor_log2 || p_log2) {
1482       logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
1483       logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
1484       logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1485    }
1486
1487    if(p_log2) {
1488       /* mant = (float) mantissa(x) */
1489       mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1490       mant = LLVMBuildOr(bld->builder, mant, one, "");
1491       mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1492
1493       logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1494                                     Elements(lp_build_log2_polynomial));
1495
1496       /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1497       logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1498
1499       res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1500    }
1501
1502    if(p_exp)
1503       *p_exp = exp;
1504
1505    if(p_floor_log2)
1506       *p_floor_log2 = logexp;
1507
1508    if(p_log2)
1509       *p_log2 = res;
1510 }
1511
1512
1513 /** scalar version of above function */
1514 static void
1515 lp_build_float_log2_approx(struct lp_build_context *bld,
1516                            LLVMValueRef x,
1517                            LLVMValueRef *p_exp,
1518                            LLVMValueRef *p_floor_log2,
1519                            LLVMValueRef *p_log2)
1520 {
1521    const struct lp_type type = bld->type;
1522    LLVMTypeRef float_type = LLVMFloatType();
1523    LLVMTypeRef int_type = LLVMIntType(type.width);
1524
1525    LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
1526    LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
1527    LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);
1528
1529    LLVMValueRef i = NULL;
1530    LLVMValueRef exp = NULL;
1531    LLVMValueRef mant = NULL;
1532    LLVMValueRef logexp = NULL;
1533    LLVMValueRef logmant = NULL;
1534    LLVMValueRef res = NULL;
1535
1536    if(p_exp || p_floor_log2 || p_log2) {
1537       /* TODO: optimize the constant case */
1538       if(LLVMIsConstant(x))
1539          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1540                       __FUNCTION__);
1541
1542       assert(type.floating && type.width == 32);
1543
1544       i = LLVMBuildBitCast(bld->builder, x, int_type, "");
1545
1546       /* exp = (float) exponent(x) */
1547       exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1548    }
1549
1550    if(p_floor_log2 || p_log2) {
1551       LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
1552       LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
1553       logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
1554       logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
1555       logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
1556    }
1557
1558    if(p_log2) {
1559       /* mant = (float) mantissa(x) */
1560       mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1561       mant = LLVMBuildOr(bld->builder, mant, one, "");
1562       mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");
1563
1564       logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1565                                     Elements(lp_build_log2_polynomial));
1566
1567       /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1568       logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1569
1570       res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1571    }
1572
1573    if(p_exp)
1574       *p_exp = exp;
1575
1576    if(p_floor_log2)
1577       *p_floor_log2 = logexp;
1578
1579    if(p_log2)
1580       *p_log2 = res;
1581 }
1582
1583
1584 LLVMValueRef
1585 lp_build_log2(struct lp_build_context *bld,
1586               LLVMValueRef x)
1587 {
1588    LLVMValueRef res;
1589    if (bld->type.length == 1) {
1590       lp_build_float_log2_approx(bld, x, NULL, NULL, &res);
1591    }
1592    else {
1593       lp_build_log2_approx(bld, x, NULL, NULL, &res);
1594    }
1595    return res;
1596 }