src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper
  32  *
  33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
  34  * notably min/max and saturated operations), and it is often necessary to
   35  * resort to machine-specific intrinsics directly. The functions here hide all
  36  * these implementation details from the other modules.
  37  *
   38  * We also do simple expression simplification here. Reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include <float.h>
  49
  50 #include "util/u_memory.h"
  51 #include "util/u_debug.h"
  52 #include "util/u_math.h"
  53 #include "util/u_string.h"
  54 #include "util/u_cpu_detect.h"
  55
  56 #include "lp_bld_type.h"
  57 #include "lp_bld_const.h"
  58 #include "lp_bld_init.h"
  59 #include "lp_bld_intr.h"
  60 #include "lp_bld_logic.h"
  61 #include "lp_bld_pack.h"
  62 #include "lp_bld_debug.h"
  63 #include "lp_bld_bitarit.h"
  64 #include "lp_bld_arit.h"
  65 #include "lp_bld_flow.h"
  66
  67 #if defined(PIPE_ARCH_SSE)
  68 #include <xmmintrin.h>
  69 #endif
  70
  71 #ifndef _MM_DENORMALS_ZERO_MASK
  72 #define _MM_DENORMALS_ZERO_MASK 0x0040
  73 #endif
  74
  75 #ifndef _MM_FLUSH_ZERO_MASK
  76 #define _MM_FLUSH_ZERO_MASK 0x8000
  77 #endif
  78
  79 #define EXP_POLY_DEGREE 5
  80
  81 #define LOG_POLY_DEGREE 4
  82
  83
  84 /**
  85  * Generate min(a, b)
  86  * No checks for special case values of a or b = 1 or 0 are done.
  87  * NaN's are handled according to the behavior specified by the
  88  * nan_behavior argument.
  89  */
  90 static LLVMValueRef
  91 lp_build_min_simple(struct lp_build_context *bld,
  92                     LLVMValueRef a,
  93                     LLVMValueRef b,
  94                     enum gallivm_nan_behavior nan_behavior)
  95 {
  96    const struct lp_type type = bld->type;
  97    const char *intrinsic = NULL;
  98    unsigned intr_size = 0;
  99    LLVMValueRef cond;
 100
 101    assert(lp_check_value(type, a));
 102    assert(lp_check_value(type, b));
 103
 104    /* TODO: optimize the constant case */
 105
 106    if (type.floating && util_cpu_caps.has_sse) {
 107       if (type.width == 32) {
 108          if (type.length == 1) {
 109             intrinsic = "llvm.x86.sse.min.ss";
 110             intr_size = 128;
 111          }
 112          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 113             intrinsic = "llvm.x86.sse.min.ps";
 114             intr_size = 128;
 115          }
 116          else {
 117             intrinsic = "llvm.x86.avx.min.ps.256";
 118             intr_size = 256;
 119          }
 120       }
 121       if (type.width == 64 && util_cpu_caps.has_sse2) {
 122          if (type.length == 1) {
 123             intrinsic = "llvm.x86.sse2.min.sd";
 124             intr_size = 128;
 125          }
 126          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 127             intrinsic = "llvm.x86.sse2.min.pd";
 128             intr_size = 128;
 129          }
 130          else {
 131             intrinsic = "llvm.x86.avx.min.pd.256";
 132             intr_size = 256;
 133          }
 134       }
 135    }
 136    else if (type.floating && util_cpu_caps.has_altivec) {
 137       if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
 138          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 139                       __FUNCTION__);
 140       }
 141       if (type.width == 32 && type.length == 4) {
 142          intrinsic = "llvm.ppc.altivec.vminfp";
 143          intr_size = 128;
 144       }
 145    } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
 146       intr_size = 128;
 147       if ((type.width == 8 || type.width == 16) &&
 148           (type.width * type.length <= 64) &&
 149           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 150          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 151                       __FUNCTION__);
 152       }
 153       if (type.width == 8 && !type.sign) {
 154          intrinsic = "llvm.x86.sse2.pminu.b";
 155       }
 156       else if (type.width == 16 && type.sign) {
 157          intrinsic = "llvm.x86.sse2.pmins.w";
 158       }
 159       if (util_cpu_caps.has_sse4_1) {
 160          if (type.width == 8 && type.sign) {
 161             intrinsic = "llvm.x86.sse41.pminsb";
 162          }
 163          if (type.width == 16 && !type.sign) {
 164             intrinsic = "llvm.x86.sse41.pminuw";
 165          }
 166          if (type.width == 32 && !type.sign) {
 167             intrinsic = "llvm.x86.sse41.pminud";
 168          }
 169          if (type.width == 32 && type.sign) {
 170             intrinsic = "llvm.x86.sse41.pminsd";
 171          }
 172       }
 173    } else if (util_cpu_caps.has_altivec) {
 174       intr_size = 128;
 175       if (type.width == 8) {
 176          if (!type.sign) {
 177             intrinsic = "llvm.ppc.altivec.vminub";
 178          } else {
 179             intrinsic = "llvm.ppc.altivec.vminsb";
 180          }
 181       } else if (type.width == 16) {
 182          if (!type.sign) {
 183             intrinsic = "llvm.ppc.altivec.vminuh";
 184          } else {
 185             intrinsic = "llvm.ppc.altivec.vminsh";
 186          }
 187       } else if (type.width == 32) {
 188          if (!type.sign) {
 189             intrinsic = "llvm.ppc.altivec.vminuw";
 190          } else {
 191             intrinsic = "llvm.ppc.altivec.vminsw";
 192          }
 193       }
 194    }
 195
 196    if(intrinsic) {
 197       /* We need to handle nan's for floating point numbers. If one of the
 198        * inputs is nan the other should be returned (required by both D3D10+
 199        * and OpenCL).
 200        * The sse intrinsics return the second operator in case of nan by
 201        * default so we need to special code to handle those.
 202        */
 203       if (util_cpu_caps.has_sse && type.floating &&
 204           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 205           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
 206          LLVMValueRef isnan, max;
 207          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 208                                                    type,
 209                                                    intr_size, a, b);
 210          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 211             isnan = lp_build_isnan(bld, b);
 212             return lp_build_select(bld, isnan, a, max);
 213          } else {
 214             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 215             isnan = lp_build_isnan(bld, a);
 216             return lp_build_select(bld, isnan, a, max);
 217          }
 218       } else {
 219          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 220                                                     type,
 221                                                     intr_size, a, b);
 222       }
 223    }
 224
 225    if (type.floating) {
 226       switch (nan_behavior) {
 227       case GALLIVM_NAN_RETURN_NAN: {
 228          LLVMValueRef isnan = lp_build_isnan(bld, b);
 229          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 230          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 231          return lp_build_select(bld, cond, a, b);
 232       }
 233          break;
 234       case GALLIVM_NAN_RETURN_OTHER: {
 235          LLVMValueRef isnan = lp_build_isnan(bld, a);
 236          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 237          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 238          return lp_build_select(bld, cond, a, b);
 239       }
 240          break;
 241       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 242          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
 243          return lp_build_select(bld, cond, a, b);
 244       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 245          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 246          return lp_build_select(bld, cond, a, b);
 247          break;
 248       default:
 249          assert(0);
 250          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 251          return lp_build_select(bld, cond, a, b);
 252       }
 253    } else {
 254       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 255       return lp_build_select(bld, cond, a, b);
 256    }
 257 }
 258
 259
 260 /**
 261  * Generate max(a, b)
 262  * No checks for special case values of a or b = 1 or 0 are done.
 263  * NaN's are handled according to the behavior specified by the
 264  * nan_behavior argument.
 265  */
 266 static LLVMValueRef
 267 lp_build_max_simple(struct lp_build_context *bld,
 268                     LLVMValueRef a,
 269                     LLVMValueRef b,
 270                     enum gallivm_nan_behavior nan_behavior)
 271 {
 272    const struct lp_type type = bld->type;
 273    const char *intrinsic = NULL;
 274    unsigned intr_size = 0;
 275    LLVMValueRef cond;
 276
 277    assert(lp_check_value(type, a));
 278    assert(lp_check_value(type, b));
 279
 280    /* TODO: optimize the constant case */
 281
 282    if (type.floating && util_cpu_caps.has_sse) {
 283       if (type.width == 32) {
 284          if (type.length == 1) {
 285             intrinsic = "llvm.x86.sse.max.ss";
 286             intr_size = 128;
 287          }
 288          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 289             intrinsic = "llvm.x86.sse.max.ps";
 290             intr_size = 128;
 291          }
 292          else {
 293             intrinsic = "llvm.x86.avx.max.ps.256";
 294             intr_size = 256;
 295          }
 296       }
 297       if (type.width == 64 && util_cpu_caps.has_sse2) {
 298          if (type.length == 1) {
 299             intrinsic = "llvm.x86.sse2.max.sd";
 300             intr_size = 128;
 301          }
 302          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 303             intrinsic = "llvm.x86.sse2.max.pd";
 304             intr_size = 128;
 305          }
 306          else {
 307             intrinsic = "llvm.x86.avx.max.pd.256";
 308             intr_size = 256;
 309          }
 310       }
 311    }
 312    else if (type.floating && util_cpu_caps.has_altivec) {
 313       if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
 314          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 315                       __FUNCTION__);
 316       }
 317       if (type.width == 32 || type.length == 4) {
 318          intrinsic = "llvm.ppc.altivec.vmaxfp";
 319          intr_size = 128;
 320       }
 321    } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
 322       intr_size = 128;
 323       if ((type.width == 8 || type.width == 16) &&
 324           (type.width * type.length <= 64) &&
 325           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 326          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 327                       __FUNCTION__);
 328          }
 329       if (type.width == 8 && !type.sign) {
 330          intrinsic = "llvm.x86.sse2.pmaxu.b";
 331          intr_size = 128;
 332       }
 333       else if (type.width == 16 && type.sign) {
 334          intrinsic = "llvm.x86.sse2.pmaxs.w";
 335       }
 336       if (util_cpu_caps.has_sse4_1) {
 337          if (type.width == 8 && type.sign) {
 338             intrinsic = "llvm.x86.sse41.pmaxsb";
 339          }
 340          if (type.width == 16 && !type.sign) {
 341             intrinsic = "llvm.x86.sse41.pmaxuw";
 342          }
 343          if (type.width == 32 && !type.sign) {
 344             intrinsic = "llvm.x86.sse41.pmaxud";
 345         }
 346          if (type.width == 32 && type.sign) {
 347             intrinsic = "llvm.x86.sse41.pmaxsd";
 348          }
 349       }
 350    } else if (util_cpu_caps.has_altivec) {
 351      intr_size = 128;
 352      if (type.width == 8) {
 353        if (!type.sign) {
 354          intrinsic = "llvm.ppc.altivec.vmaxub";
 355        } else {
 356          intrinsic = "llvm.ppc.altivec.vmaxsb";
 357        }
 358      } else if (type.width == 16) {
 359        if (!type.sign) {
 360          intrinsic = "llvm.ppc.altivec.vmaxuh";
 361        } else {
 362          intrinsic = "llvm.ppc.altivec.vmaxsh";
 363        }
 364      } else if (type.width == 32) {
 365        if (!type.sign) {
 366          intrinsic = "llvm.ppc.altivec.vmaxuw";
 367        } else {
 368          intrinsic = "llvm.ppc.altivec.vmaxsw";
 369        }
 370      }
 371    }
 372
 373    if(intrinsic) {
 374       if (util_cpu_caps.has_sse && type.floating &&
 375           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 376           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
 377          LLVMValueRef isnan, min;
 378          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 379                                                    type,
 380                                                    intr_size, a, b);
 381          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 382             isnan = lp_build_isnan(bld, b);
 383             return lp_build_select(bld, isnan, a, min);
 384          } else {
 385             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 386             isnan = lp_build_isnan(bld, a);
 387             return lp_build_select(bld, isnan, a, min);
 388          }
 389       } else {
 390          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 391                                                     type,
 392                                                     intr_size, a, b);
 393       }
 394    }
 395
 396    if (type.floating) {
 397       switch (nan_behavior) {
 398       case GALLIVM_NAN_RETURN_NAN: {
 399          LLVMValueRef isnan = lp_build_isnan(bld, b);
 400          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 401          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 402          return lp_build_select(bld, cond, a, b);
 403       }
 404          break;
 405       case GALLIVM_NAN_RETURN_OTHER: {
 406          LLVMValueRef isnan = lp_build_isnan(bld, a);
 407          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 408          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 409          return lp_build_select(bld, cond, a, b);
 410       }
 411          break;
 412       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 413          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
 414          return lp_build_select(bld, cond, a, b);
 415       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 416          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 417          return lp_build_select(bld, cond, a, b);
 418          break;
 419       default:
 420          assert(0);
 421          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 422          return lp_build_select(bld, cond, a, b);
 423       }
 424    } else {
 425       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 426       return lp_build_select(bld, cond, a, b);
 427    }
 428 }
 429
 430
 431 /**
 432  * Generate 1 - a, or ~a depending on bld->type.
 433  */
 434 LLVMValueRef
 435 lp_build_comp(struct lp_build_context *bld,
 436               LLVMValueRef a)
 437 {
 438    LLVMBuilderRef builder = bld->gallivm->builder;
 439    const struct lp_type type = bld->type;
 440
 441    assert(lp_check_value(type, a));
 442
 443    if(a == bld->one)
 444       return bld->zero;
 445    if(a == bld->zero)
 446       return bld->one;
 447
 448    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 449       if(LLVMIsConstant(a))
 450          return LLVMConstNot(a);
 451       else
 452          return LLVMBuildNot(builder, a, "");
 453    }
 454
 455    if(LLVMIsConstant(a))
 456       if (type.floating)
 457           return LLVMConstFSub(bld->one, a);
 458       else
 459           return LLVMConstSub(bld->one, a);
 460    else
 461       if (type.floating)
 462          return LLVMBuildFSub(builder, bld->one, a, "");
 463       else
 464          return LLVMBuildSub(builder, bld->one, a, "");
 465 }
 466
 467
 468 /**
 469  * Generate a + b
 470  */
 471 LLVMValueRef
 472 lp_build_add(struct lp_build_context *bld,
 473              LLVMValueRef a,
 474              LLVMValueRef b)
 475 {
 476    LLVMBuilderRef builder = bld->gallivm->builder;
 477    const struct lp_type type = bld->type;
 478    LLVMValueRef res;
 479
 480    assert(lp_check_value(type, a));
 481    assert(lp_check_value(type, b));
 482
 483    if(a == bld->zero)
 484       return b;
 485    if(b == bld->zero)
 486       return a;
 487    if(a == bld->undef || b == bld->undef)
 488       return bld->undef;
 489
 490    if(bld->type.norm) {
 491       const char *intrinsic = NULL;
 492
 493       if(a == bld->one || b == bld->one)
 494         return bld->one;
 495
 496       if (type.width * type.length == 128 &&
 497           !type.floating && !type.fixed) {
 498          if(util_cpu_caps.has_sse2) {
 499            if(type.width == 8)
 500              intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
 501            if(type.width == 16)
 502              intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
 503          } else if (util_cpu_caps.has_altivec) {
 504            if(type.width == 8)
 505               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
 506            if(type.width == 16)
 507               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
 508          }
 509       }
 510
 511       if(intrinsic)
 512          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 513    }
 514
 515    /* TODO: handle signed case */
 516    if(type.norm && !type.floating && !type.fixed && !type.sign)
 517       a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 518
 519    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 520       if (type.floating)
 521          res = LLVMConstFAdd(a, b);
 522       else
 523          res = LLVMConstAdd(a, b);
 524    else
 525       if (type.floating)
 526          res = LLVMBuildFAdd(builder, a, b, "");
 527       else
 528          res = LLVMBuildAdd(builder, a, b, "");
 529
 530    /* clamp to ceiling of 1.0 */
 531    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 532       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 533
 534    /* XXX clamp to floor of -1 or 0??? */
 535
 536    return res;
 537 }
 538
 539
 540 /** Return the scalar sum of the elements of a.
 541  * Should avoid this operation whenever possible.
 542  */
 543 LLVMValueRef
 544 lp_build_horizontal_add(struct lp_build_context *bld,
 545                         LLVMValueRef a)
 546 {
 547    LLVMBuilderRef builder = bld->gallivm->builder;
 548    const struct lp_type type = bld->type;
 549    LLVMValueRef index, res;
 550    unsigned i, length;
 551    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
 552    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
 553    LLVMValueRef vecres, elem2;
 554
 555    assert(lp_check_value(type, a));
 556
 557    if (type.length == 1) {
 558       return a;
 559    }
 560
 561    assert(!bld->type.norm);
 562
 563    /*
 564     * for byte vectors can do much better with psadbw.
 565     * Using repeated shuffle/adds here. Note with multiple vectors
 566     * this can be done more efficiently as outlined in the intel
 567     * optimization manual.
 568     * Note: could cause data rearrangement if used with smaller element
 569     * sizes.
 570     */
 571
 572    vecres = a;
 573    length = type.length / 2;
 574    while (length > 1) {
 575       LLVMValueRef vec1, vec2;
 576       for (i = 0; i < length; i++) {
 577          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
 578          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
 579       }
 580       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
 581                                     LLVMConstVector(shuffles1, length), "");
 582       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
 583                                     LLVMConstVector(shuffles2, length), "");
 584       if (type.floating) {
 585          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
 586       }
 587       else {
 588          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
 589       }
 590       length = length >> 1;
 591    }
 592
 593    /* always have vector of size 2 here */
 594    assert(length == 1);
 595
 596    index = lp_build_const_int32(bld->gallivm, 0);
 597    res = LLVMBuildExtractElement(builder, vecres, index, "");
 598    index = lp_build_const_int32(bld->gallivm, 1);
 599    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
 600
 601    if (type.floating)
 602       res = LLVMBuildFAdd(builder, res, elem2, "");
 603     else
 604       res = LLVMBuildAdd(builder, res, elem2, "");
 605
 606    return res;
 607 }
 608
 609 /**
 610  * Return the horizontal sums of 4 float vectors as a float4 vector.
 611  * This uses the technique as outlined in Intel Optimization Manual.
 612  */
 613 static LLVMValueRef
 614 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
 615                             LLVMValueRef src[4])
 616 {
 617    struct gallivm_state *gallivm = bld->gallivm;
 618    LLVMBuilderRef builder = gallivm->builder;
 619    LLVMValueRef shuffles[4];
 620    LLVMValueRef tmp[4];
 621    LLVMValueRef sumtmp[2], shuftmp[2];
 622
 623    /* lower half of regs */
 624    shuffles[0] = lp_build_const_int32(gallivm, 0);
 625    shuffles[1] = lp_build_const_int32(gallivm, 1);
 626    shuffles[2] = lp_build_const_int32(gallivm, 4);
 627    shuffles[3] = lp_build_const_int32(gallivm, 5);
 628    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
 629                                    LLVMConstVector(shuffles, 4), "");
 630    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
 631                                    LLVMConstVector(shuffles, 4), "");
 632
 633    /* upper half of regs */
 634    shuffles[0] = lp_build_const_int32(gallivm, 2);
 635    shuffles[1] = lp_build_const_int32(gallivm, 3);
 636    shuffles[2] = lp_build_const_int32(gallivm, 6);
 637    shuffles[3] = lp_build_const_int32(gallivm, 7);
 638    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
 639                                    LLVMConstVector(shuffles, 4), "");
 640    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
 641                                    LLVMConstVector(shuffles, 4), "");
 642
 643    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
 644    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
 645
 646    shuffles[0] = lp_build_const_int32(gallivm, 0);
 647    shuffles[1] = lp_build_const_int32(gallivm, 2);
 648    shuffles[2] = lp_build_const_int32(gallivm, 4);
 649    shuffles[3] = lp_build_const_int32(gallivm, 6);
 650    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 651                                        LLVMConstVector(shuffles, 4), "");
 652
 653    shuffles[0] = lp_build_const_int32(gallivm, 1);
 654    shuffles[1] = lp_build_const_int32(gallivm, 3);
 655    shuffles[2] = lp_build_const_int32(gallivm, 5);
 656    shuffles[3] = lp_build_const_int32(gallivm, 7);
 657    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 658                                        LLVMConstVector(shuffles, 4), "");
 659
 660    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
 661 }
 662
 663
 664 /*
 665  * partially horizontally add 2-4 float vectors with length nx4,
 666  * i.e. only four adjacent values in each vector will be added,
 667  * assuming values are really grouped in 4 which also determines
 668  * output order.
 669  *
 670  * Return a vector of the same length as the initial vectors,
 671  * with the excess elements (if any) being undefined.
 672  * The element order is independent of number of input vectors.
 673  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 674  * the output order thus will be
 675  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
 676  */
 677 LLVMValueRef
 678 lp_build_hadd_partial4(struct lp_build_context *bld,
 679                        LLVMValueRef vectors[],
 680                        unsigned num_vecs)
 681 {
 682    struct gallivm_state *gallivm = bld->gallivm;
 683    LLVMBuilderRef builder = gallivm->builder;
 684    LLVMValueRef ret_vec;
 685    LLVMValueRef tmp[4];
 686    const char *intrinsic = NULL;
 687
 688    assert(num_vecs >= 2 && num_vecs <= 4);
 689    assert(bld->type.floating);
 690
 691    /* only use this with at least 2 vectors, as it is sort of expensive
 692     * (depending on cpu) and we always need two horizontal adds anyway,
 693     * so a shuffle/add approach might be better.
 694     */
 695
 696    tmp[0] = vectors[0];
 697    tmp[1] = vectors[1];
 698
 699    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
 700    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
 701
 702    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
 703        bld->type.length == 4) {
 704       intrinsic = "llvm.x86.sse3.hadd.ps";
 705    }
 706    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
 707             bld->type.length == 8) {
 708       intrinsic = "llvm.x86.avx.hadd.ps.256";
 709    }
 710    if (intrinsic) {
 711       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
 712                                        lp_build_vec_type(gallivm, bld->type),
 713                                        tmp[0], tmp[1]);
 714       if (num_vecs > 2) {
 715          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
 716                                           lp_build_vec_type(gallivm, bld->type),
 717                                           tmp[2], tmp[3]);
 718       }
 719       else {
 720          tmp[1] = tmp[0];
 721       }
 722       return lp_build_intrinsic_binary(builder, intrinsic,
 723                                        lp_build_vec_type(gallivm, bld->type),
 724                                        tmp[0], tmp[1]);
 725    }
 726
 727    if (bld->type.length == 4) {
 728       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
 729    }
 730    else {
 731       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
 732       unsigned j;
 733       unsigned num_iter = bld->type.length / 4;
 734       struct lp_type parttype = bld->type;
 735       parttype.length = 4;
 736       for (j = 0; j < num_iter; j++) {
 737          LLVMValueRef partsrc[4];
 738          unsigned i;
 739          for (i = 0; i < 4; i++) {
 740             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
 741          }
 742          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
 743       }
 744       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
 745    }
 746    return ret_vec;
 747 }
 748
 749 /**
 750  * Generate a - b
 751  */
 752 LLVMValueRef
 753 lp_build_sub(struct lp_build_context *bld,
 754              LLVMValueRef a,
 755              LLVMValueRef b)
 756 {
 757    LLVMBuilderRef builder = bld->gallivm->builder;
 758    const struct lp_type type = bld->type;
 759    LLVMValueRef res;
 760
 761    assert(lp_check_value(type, a));
 762    assert(lp_check_value(type, b));
 763
 764    if(b == bld->zero)
 765       return a;
 766    if(a == bld->undef || b == bld->undef)
 767       return bld->undef;
 768    if(a == b)
 769       return bld->zero;
 770
 771    if(bld->type.norm) {
 772       const char *intrinsic = NULL;
 773
 774       if(b == bld->one)
 775         return bld->zero;
 776
 777       if (type.width * type.length == 128 &&
 778           !type.floating && !type.fixed) {
 779          if (util_cpu_caps.has_sse2) {
 780            if(type.width == 8)
 781               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
 782            if(type.width == 16)
 783               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
 784          } else if (util_cpu_caps.has_altivec) {
 785            if(type.width == 8)
 786               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
 787            if(type.width == 16)
 788               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
 789          }
 790       }
 791
 792       if(intrinsic)
 793          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 794    }
 795
 796    /* TODO: handle signed case */
 797    if(type.norm && !type.floating && !type.fixed && !type.sign)
 798       a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 799
 800    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 801       if (type.floating)
 802          res = LLVMConstFSub(a, b);
 803       else
 804          res = LLVMConstSub(a, b);
 805    else
 806       if (type.floating)
 807          res = LLVMBuildFSub(builder, a, b, "");
 808       else
 809          res = LLVMBuildSub(builder, a, b, "");
 810
 811    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 812       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 813
 814    return res;
 815 }
 816
 817
 818
 819 /**
 820  * Normalized multiplication.
 821  *
 822  * There are several approaches for (using 8-bit normalized multiplication as
 823  * an example):
 824  *
 825  * - alpha plus one
 826  *
 827  *     makes the following approximation to the division (Sree)
 828  *
  829  *       a*b/255 ~= (a*(b + 1)) >> 8
 830  *
 831  *     which is the fastest method that satisfies the following OpenGL criteria of
 832  *
 833  *       0*0 = 0 and 255*255 = 255
 834  *
 835  * - geometric series
 836  *
 837  *     takes the geometric series approximation to the division
 838  *
 839  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 840  *
 841  *     in this case just the first two terms to fit in 16bit arithmetic
 842  *
 843  *       t/255 ~= (t + (t >> 8)) >> 8
 844  *
 845  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 846  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 847  *     must be used.
 848  *
 849  * - geometric series plus rounding
 850  *
 851  *     when using a geometric series division instead of truncating the result
 852  *     use roundoff in the approximation (Jim Blinn)
 853  *
 854  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 855  *
 856  *     achieving the exact results.
 857  *
 858  *
 859  *
 860  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 861  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 862  * @sa Michael Herf, The "double blend trick", May 2000,
 863  *     http://www.stereopsis.com/doubleblend.html
 864  */
 865 static LLVMValueRef
 866 lp_build_mul_norm(struct gallivm_state *gallivm,
 867                   struct lp_type wide_type,
 868                   LLVMValueRef a, LLVMValueRef b)
 869 {
 870    LLVMBuilderRef builder = gallivm->builder;
 871    struct lp_build_context bld;
 872    unsigned n;
 873    LLVMValueRef half;
 874    LLVMValueRef ab;
 875
 876    assert(!wide_type.floating);
 877    assert(lp_check_value(wide_type, a));
 878    assert(lp_check_value(wide_type, b));
 879
 880    lp_build_context_init(&bld, gallivm, wide_type);
 881
 882    n = wide_type.width / 2;
 883    if (wide_type.sign) {
 884       --n;
 885    }
 886
 887    /*
 888     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
 889     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
 890     */
 891
 892    /*
 893     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
 894     */
 895
 896    ab = LLVMBuildMul(builder, a, b, "");
 897    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
 898
 899    /*
 900     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
 901     */
 902
 903    half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
 904    if (wide_type.sign) {
 905       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
 906       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
 907       half = lp_build_select(&bld, sign, minus_half, half);
 908    }
 909    ab = LLVMBuildAdd(builder, ab, half, "");
 910
 911    /* Final division */
 912    ab = lp_build_shr_imm(&bld, ab, n);
 913
 914    return ab;
 915 }
 916
 917 /**
 918  * Generate a * b
 919  */
 920 LLVMValueRef
 921 lp_build_mul(struct lp_build_context *bld,
 922              LLVMValueRef a,
 923              LLVMValueRef b)
 924 {
 925    LLVMBuilderRef builder = bld->gallivm->builder;
 926    const struct lp_type type = bld->type;
 927    LLVMValueRef shift;
 928    LLVMValueRef res;
 929
 930    assert(lp_check_value(type, a));
 931    assert(lp_check_value(type, b));
 932
 933    if(a == bld->zero)
 934       return bld->zero;
 935    if(a == bld->one)
 936       return b;
 937    if(b == bld->zero)
 938       return bld->zero;
 939    if(b == bld->one)
 940       return a;
 941    if(a == bld->undef || b == bld->undef)
 942       return bld->undef;
 943
 944    if (!type.floating && !type.fixed && type.norm) {
 945       struct lp_type wide_type = lp_wider_type(type);
 946       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 947
 948       lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
 949       lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
 950
 951       /* PMULLW, PSRLW, PADDW */
 952       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
 953       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
 954
 955       ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
 956
 957       return ab;
 958    }
 959
 960    if(type.fixed)
 961       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
 962    else
 963       shift = NULL;
 964
 965    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 966       if (type.floating)
 967          res = LLVMConstFMul(a, b);
 968       else
 969          res = LLVMConstMul(a, b);
 970       if(shift) {
 971          if(type.sign)
 972             res = LLVMConstAShr(res, shift);
 973          else
 974             res = LLVMConstLShr(res, shift);
 975       }
 976    }
 977    else {
 978       if (type.floating)
 979          res = LLVMBuildFMul(builder, a, b, "");
 980       else
 981          res = LLVMBuildMul(builder, a, b, "");
 982       if(shift) {
 983          if(type.sign)
 984             res = LLVMBuildAShr(builder, res, shift, "");
 985          else
 986             res = LLVMBuildLShr(builder, res, shift, "");
 987       }
 988    }
 989
 990    return res;
 991 }
 992
 993
 994 /**
 995  * Small vector x scale multiplication optimization.
 996  */
 997 LLVMValueRef
 998 lp_build_mul_imm(struct lp_build_context *bld,
 999                  LLVMValueRef a,
1000                  int b)
1001 {
1002    LLVMBuilderRef builder = bld->gallivm->builder;
1003    LLVMValueRef factor;
1004
1005    assert(lp_check_value(bld->type, a));
1006
1007    if(b == 0)
1008       return bld->zero;
1009
1010    if(b == 1)
1011       return a;
1012
1013    if(b == -1)
1014       return lp_build_negate(bld, a);
1015
1016    if(b == 2 && bld->type.floating)
1017       return lp_build_add(bld, a, a);
1018
1019    if(util_is_power_of_two(b)) {
1020       unsigned shift = ffs(b) - 1;
1021
1022       if(bld->type.floating) {
1023 #if 0
1024          /*
1025           * Power of two multiplication by directly manipulating the exponent.
1026           *
1027           * XXX: This might not be always faster, it will introduce a small error
1028           * for multiplication by zero, and it will produce wrong results
1029           * for Inf and NaN.
1030           */
1031          unsigned mantissa = lp_mantissa(bld->type);
1032          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1033          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1034          a = LLVMBuildAdd(builder, a, factor, "");
1035          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1036          return a;
1037 #endif
1038       }
1039       else {
1040          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1041          return LLVMBuildShl(builder, a, factor, "");
1042       }
1043    }
1044
1045    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1046    return lp_build_mul(bld, a, factor);
1047 }
1048
1049
1050 /**
1051  * Generate a / b
1052  */
1053 LLVMValueRef
1054 lp_build_div(struct lp_build_context *bld,
1055              LLVMValueRef a,
1056              LLVMValueRef b)
1057 {
1058    LLVMBuilderRef builder = bld->gallivm->builder;
1059    const struct lp_type type = bld->type;
1060
1061    assert(lp_check_value(type, a));
1062    assert(lp_check_value(type, b));
1063
1064    if(a == bld->zero)
1065       return bld->zero;
1066    if(a == bld->one)
1067       return lp_build_rcp(bld, b);
1068    if(b == bld->zero)
1069       return bld->undef;
1070    if(b == bld->one)
1071       return a;
1072    if(a == bld->undef || b == bld->undef)
1073       return bld->undef;
1074
1075    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1076       if (type.floating)
1077          return LLVMConstFDiv(a, b);
1078       else if (type.sign)
1079          return LLVMConstSDiv(a, b);
1080       else
1081          return LLVMConstUDiv(a, b);
1082    }
1083
1084    if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1085        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1086       type.floating)
1087       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1088
1089    if (type.floating)
1090       return LLVMBuildFDiv(builder, a, b, "");
1091    else if (type.sign)
1092       return LLVMBuildSDiv(builder, a, b, "");
1093    else
1094       return LLVMBuildUDiv(builder, a, b, "");
1095 }
1096
1097
1098 /**
1099  * Linear interpolation helper.
1100  *
1101  * @param normalized whether we are interpolating normalized values,
1102  *        encoded in normalized integers, twice as wide.
1103  *
1104  * @sa http://www.stereopsis.com/doubleblend.html
1105  */
1106 static INLINE LLVMValueRef
1107 lp_build_lerp_simple(struct lp_build_context *bld,
1108                      LLVMValueRef x,
1109                      LLVMValueRef v0,
1110                      LLVMValueRef v1,
1111                      unsigned flags)
1112 {
1113    unsigned half_width = bld->type.width/2;
1114    LLVMBuilderRef builder = bld->gallivm->builder;
1115    LLVMValueRef delta;
1116    LLVMValueRef res;
1117
1118    assert(lp_check_value(bld->type, x));
1119    assert(lp_check_value(bld->type, v0));
1120    assert(lp_check_value(bld->type, v1));
1121
1122    delta = lp_build_sub(bld, v1, v0);
1123
1124    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1125       if (!bld->type.sign) {
1126          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1127             /*
1128              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1129              * most-significant-bit to the lowest-significant-bit, so that
1130              * later we can just divide by 2**n instead of 2**n - 1.
1131              */
1132
1133             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1134          }
1135
1136          /* (x * delta) >> n */
1137          res = lp_build_mul(bld, x, delta);
1138          res = lp_build_shr_imm(bld, res, half_width);
1139       } else {
1140          /*
1141           * The rescaling trick above doesn't work for signed numbers, so
1142           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1143           * instead.
1144           */
1145          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1146          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1147       }
1148    } else {
1149       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1150       res = lp_build_mul(bld, x, delta);
1151    }
1152
1153    res = lp_build_add(bld, v0, res);
1154
1155    if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1156        bld->type.fixed) {
1157       /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1158       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1159        * but it will be wrong for true fixed point use cases. Basically we need
1160        * a more powerful lp_type, capable of further distinguishing the values
1161        * interpretation from the value storage. */
1162       res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1163    }
1164
1165    return res;
1166 }
1167
1168
1169 /**
1170  * Linear interpolation.
1171  */
1172 LLVMValueRef
1173 lp_build_lerp(struct lp_build_context *bld,
1174               LLVMValueRef x,
1175               LLVMValueRef v0,
1176               LLVMValueRef v1,
1177               unsigned flags)
1178 {
1179    const struct lp_type type = bld->type;
1180    LLVMValueRef res;
1181
1182    assert(lp_check_value(type, x));
1183    assert(lp_check_value(type, v0));
1184    assert(lp_check_value(type, v1));
1185
1186    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1187
1188    if (type.norm) {
1189       struct lp_type wide_type;
1190       struct lp_build_context wide_bld;
1191       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1192
1193       assert(type.length >= 2);
1194
1195       /*
1196        * Create a wider integer type, enough to hold the
1197        * intermediate result of the multiplication.
1198        */
1199       memset(&wide_type, 0, sizeof wide_type);
1200       wide_type.sign   = type.sign;
1201       wide_type.width  = type.width*2;
1202       wide_type.length = type.length/2;
1203
1204       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1205
1206       lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1207       lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1208       lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1209
1210       /*
1211        * Lerp both halves.
1212        */
1213
1214       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1215
1216       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1217       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1218
1219       res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1220    } else {
1221       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1222    }
1223
1224    return res;
1225 }
1226
1227
1228 /**
1229  * Bilinear interpolation.
1230  *
1231  * Values indices are in v_{yx}.
1232  */
1233 LLVMValueRef
1234 lp_build_lerp_2d(struct lp_build_context *bld,
1235                  LLVMValueRef x,
1236                  LLVMValueRef y,
1237                  LLVMValueRef v00,
1238                  LLVMValueRef v01,
1239                  LLVMValueRef v10,
1240                  LLVMValueRef v11,
1241                  unsigned flags)
1242 {
1243    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1244    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1245    return lp_build_lerp(bld, y, v0, v1, flags);
1246 }
1247
1248
1249 LLVMValueRef
1250 lp_build_lerp_3d(struct lp_build_context *bld,
1251                  LLVMValueRef x,
1252                  LLVMValueRef y,
1253                  LLVMValueRef z,
1254                  LLVMValueRef v000,
1255                  LLVMValueRef v001,
1256                  LLVMValueRef v010,
1257                  LLVMValueRef v011,
1258                  LLVMValueRef v100,
1259                  LLVMValueRef v101,
1260                  LLVMValueRef v110,
1261                  LLVMValueRef v111,
1262                  unsigned flags)
1263 {
1264    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1265    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1266    return lp_build_lerp(bld, z, v0, v1, flags);
1267 }
1268
1269
1270 /**
1271  * Generate min(a, b)
1272  * Do checks for special cases but not for nans.
1273  */
1274 LLVMValueRef
1275 lp_build_min(struct lp_build_context *bld,
1276              LLVMValueRef a,
1277              LLVMValueRef b)
1278 {
1279    assert(lp_check_value(bld->type, a));
1280    assert(lp_check_value(bld->type, b));
1281
1282    if(a == bld->undef || b == bld->undef)
1283       return bld->undef;
1284
1285    if(a == b)
1286       return a;
1287
1288    if (bld->type.norm) {
1289       if (!bld->type.sign) {
1290          if (a == bld->zero || b == bld->zero) {
1291             return bld->zero;
1292          }
1293       }
1294       if(a == bld->one)
1295          return b;
1296       if(b == bld->one)
1297          return a;
1298    }
1299
1300    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1301 }
1302
1303
1304 /**
1305  * Generate min(a, b)
1306  * NaN's are handled according to the behavior specified by the
1307  * nan_behavior argument.
1308  */
1309 LLVMValueRef
1310 lp_build_min_ext(struct lp_build_context *bld,
1311                  LLVMValueRef a,
1312                  LLVMValueRef b,
1313                  enum gallivm_nan_behavior nan_behavior)
1314 {
1315    assert(lp_check_value(bld->type, a));
1316    assert(lp_check_value(bld->type, b));
1317
1318    if(a == bld->undef || b == bld->undef)
1319       return bld->undef;
1320
1321    if(a == b)
1322       return a;
1323
1324    if (bld->type.norm) {
1325       if (!bld->type.sign) {
1326          if (a == bld->zero || b == bld->zero) {
1327             return bld->zero;
1328          }
1329       }
1330       if(a == bld->one)
1331          return b;
1332       if(b == bld->one)
1333          return a;
1334    }
1335
1336    return lp_build_min_simple(bld, a, b, nan_behavior);
1337 }
1338
1339 /**
1340  * Generate max(a, b)
1341  * Do checks for special cases, but NaN behavior is undefined.
1342  */
1343 LLVMValueRef
1344 lp_build_max(struct lp_build_context *bld,
1345              LLVMValueRef a,
1346              LLVMValueRef b)
1347 {
1348    assert(lp_check_value(bld->type, a));
1349    assert(lp_check_value(bld->type, b));
1350
1351    if(a == bld->undef || b == bld->undef)
1352       return bld->undef;
1353
1354    if(a == b)
1355       return a;
1356
1357    if(bld->type.norm) {
1358       if(a == bld->one || b == bld->one)
1359          return bld->one;
1360       if (!bld->type.sign) {
1361          if (a == bld->zero) {
1362             return b;
1363          }
1364          if (b == bld->zero) {
1365             return a;
1366          }
1367       }
1368    }
1369
1370    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1371 }
1372
1373
1374 /**
1375  * Generate max(a, b)
1376  * Checks for special cases.
1377  * NaN's are handled according to the behavior specified by the
1378  * nan_behavior argument.
1379  */
1380 LLVMValueRef
1381 lp_build_max_ext(struct lp_build_context *bld,
1382                   LLVMValueRef a,
1383                   LLVMValueRef b,
1384                   enum gallivm_nan_behavior nan_behavior)
1385 {
1386    assert(lp_check_value(bld->type, a));
1387    assert(lp_check_value(bld->type, b));
1388
1389    if(a == bld->undef || b == bld->undef)
1390       return bld->undef;
1391
1392    if(a == b)
1393       return a;
1394
1395    if(bld->type.norm) {
1396       if(a == bld->one || b == bld->one)
1397          return bld->one;
1398       if (!bld->type.sign) {
1399          if (a == bld->zero) {
1400             return b;
1401          }
1402          if (b == bld->zero) {
1403             return a;
1404          }
1405       }
1406    }
1407
1408    return lp_build_max_simple(bld, a, b, nan_behavior);
1409 }
1410
1411 /**
1412  * Generate clamp(a, min, max)
1413  * NaN behavior (for any of a, min, max) is undefined.
1414  * Do checks for special cases.
1415  */
1416 LLVMValueRef
1417 lp_build_clamp(struct lp_build_context *bld,
1418                LLVMValueRef a,
1419                LLVMValueRef min,
1420                LLVMValueRef max)
1421 {
1422    assert(lp_check_value(bld->type, a));
1423    assert(lp_check_value(bld->type, min));
1424    assert(lp_check_value(bld->type, max));
1425
1426    a = lp_build_min(bld, a, max);
1427    a = lp_build_max(bld, a, min);
1428    return a;
1429 }
1430
1431
1432 /**
1433  * Generate clamp(a, 0, 1)
1434  * A NaN will get converted to zero.
1435  */
1436 LLVMValueRef
1437 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1438                                 LLVMValueRef a)
1439 {
1440    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1441    a = lp_build_min(bld, a, bld->one);
1442    return a;
1443 }
1444
1445
1446 /**
1447  * Generate abs(a)
1448  */
1449 LLVMValueRef
1450 lp_build_abs(struct lp_build_context *bld,
1451              LLVMValueRef a)
1452 {
1453    LLVMBuilderRef builder = bld->gallivm->builder;
1454    const struct lp_type type = bld->type;
1455    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1456
1457    assert(lp_check_value(type, a));
1458
1459    if(!type.sign)
1460       return a;
1461
1462    if(type.floating) {
1463       /* Mask out the sign bit */
1464       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1465       unsigned long long absMask = ~(1ULL << (type.width - 1));
1466       LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1467       a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1468       a = LLVMBuildAnd(builder, a, mask, "");
1469       a = LLVMBuildBitCast(builder, a, vec_type, "");
1470       return a;
1471    }
1472
1473    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1474       switch(type.width) {
1475       case 8:
1476          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1477       case 16:
1478          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1479       case 32:
1480          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1481       }
1482    }
1483    else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1484             (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1485             (type.width == 8 || type.width == 16 || type.width == 32)) {
1486       debug_printf("%s: inefficient code, should split vectors manually\n",
1487                    __FUNCTION__);
1488    }
1489
1490    return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1491 }
1492
1493
1494 LLVMValueRef
1495 lp_build_negate(struct lp_build_context *bld,
1496                 LLVMValueRef a)
1497 {
1498    LLVMBuilderRef builder = bld->gallivm->builder;
1499
1500    assert(lp_check_value(bld->type, a));
1501
1502    if (bld->type.floating)
1503       a = LLVMBuildFNeg(builder, a, "");
1504    else
1505       a = LLVMBuildNeg(builder, a, "");
1506
1507    return a;
1508 }
1509
1510
1511 /** Return -1, 0 or +1 depending on the sign of a */
1512 LLVMValueRef
1513 lp_build_sgn(struct lp_build_context *bld,
1514              LLVMValueRef a)
1515 {
1516    LLVMBuilderRef builder = bld->gallivm->builder;
1517    const struct lp_type type = bld->type;
1518    LLVMValueRef cond;
1519    LLVMValueRef res;
1520
1521    assert(lp_check_value(type, a));
1522
1523    /* Handle non-zero case */
1524    if(!type.sign) {
1525       /* if not zero then sign must be positive */
1526       res = bld->one;
1527    }
1528    else if(type.floating) {
1529       LLVMTypeRef vec_type;
1530       LLVMTypeRef int_type;
1531       LLVMValueRef mask;
1532       LLVMValueRef sign;
1533       LLVMValueRef one;
1534       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1535
1536       int_type = lp_build_int_vec_type(bld->gallivm, type);
1537       vec_type = lp_build_vec_type(bld->gallivm, type);
1538       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1539
1540       /* Take the sign bit and add it to 1 constant */
1541       sign = LLVMBuildBitCast(builder, a, int_type, "");
1542       sign = LLVMBuildAnd(builder, sign, mask, "");
1543       one = LLVMConstBitCast(bld->one, int_type);
1544       res = LLVMBuildOr(builder, sign, one, "");
1545       res = LLVMBuildBitCast(builder, res, vec_type, "");
1546    }
1547    else
1548    {
1549       /* signed int/norm/fixed point */
1550       /* could use psign with sse3 and appropriate vectors here */
1551       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1552       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1553       res = lp_build_select(bld, cond, bld->one, minus_one);
1554    }
1555
1556    /* Handle zero */
1557    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1558    res = lp_build_select(bld, cond, bld->zero, res);
1559
1560    return res;
1561 }
1562
1563
1564 /**
1565  * Set the sign of float vector 'a' according to 'sign'.
1566  * If sign==0, return abs(a).
1567  * If sign==1, return -abs(a);
1568  * Other values for sign produce undefined results.
1569  */
1570 LLVMValueRef
1571 lp_build_set_sign(struct lp_build_context *bld,
1572                   LLVMValueRef a, LLVMValueRef sign)
1573 {
1574    LLVMBuilderRef builder = bld->gallivm->builder;
1575    const struct lp_type type = bld->type;
1576    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1577    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1578    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1579    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1580                              ~((unsigned long long) 1 << (type.width - 1)));
1581    LLVMValueRef val, res;
1582
1583    assert(type.floating);
1584    assert(lp_check_value(type, a));
1585
1586    /* val = reinterpret_cast<int>(a) */
1587    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1588    /* val = val & mask */
1589    val = LLVMBuildAnd(builder, val, mask, "");
1590    /* sign = sign << shift */
1591    sign = LLVMBuildShl(builder, sign, shift, "");
1592    /* res = val | sign */
1593    res = LLVMBuildOr(builder, val, sign, "");
1594    /* res = reinterpret_cast<float>(res) */
1595    res = LLVMBuildBitCast(builder, res, vec_type, "");
1596
1597    return res;
1598 }
1599
1600
1601 /**
1602  * Convert vector of (or scalar) int to vector of (or scalar) float.
1603  */
1604 LLVMValueRef
1605 lp_build_int_to_float(struct lp_build_context *bld,
1606                       LLVMValueRef a)
1607 {
1608    LLVMBuilderRef builder = bld->gallivm->builder;
1609    const struct lp_type type = bld->type;
1610    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1611
1612    assert(type.floating);
1613
1614    return LLVMBuildSIToFP(builder, a, vec_type, "");
1615 }
1616
1617 static boolean
1618 arch_rounding_available(const struct lp_type type)
1619 {
1620    if ((util_cpu_caps.has_sse4_1 &&
1621        (type.length == 1 || type.width*type.length == 128)) ||
1622        (util_cpu_caps.has_avx && type.width*type.length == 256))
1623       return TRUE;
1624    else if ((util_cpu_caps.has_altivec &&
1625             (type.width == 32 && type.length == 4)))
1626       return TRUE;
1627
1628    return FALSE;
1629 }
1630
/**
 * Rounding modes understood by the lp_build_round_* helpers.
 *
 * The numeric values matter: lp_build_round_sse41() passes the mode straight
 * through as the integer immediate of the rounding intrinsic.
 */
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST  = 0,  /* round to nearest */
   LP_BUILD_ROUND_FLOOR    = 1,  /* round toward negative infinity */
   LP_BUILD_ROUND_CEIL     = 2,  /* round toward positive infinity */
   LP_BUILD_ROUND_TRUNCATE = 3   /* round toward zero */
};
1638
1639 /**
1640  * Helper for SSE4.1's ROUNDxx instructions.
1641  *
1642  * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1643  * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
1644  */
1645 static INLINE LLVMValueRef
1646 lp_build_round_sse41(struct lp_build_context *bld,
1647                      LLVMValueRef a,
1648                      enum lp_build_round_mode mode)
1649 {
1650    LLVMBuilderRef builder = bld->gallivm->builder;
1651    const struct lp_type type = bld->type;
1652    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1653    const char *intrinsic;
1654    LLVMValueRef res;
1655
1656    assert(type.floating);
1657
1658    assert(lp_check_value(type, a));
1659    assert(util_cpu_caps.has_sse4_1);
1660
1661    if (type.length == 1) {
1662       LLVMTypeRef vec_type;
1663       LLVMValueRef undef;
1664       LLVMValueRef args[3];
1665       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1666
1667       switch(type.width) {
1668       case 32:
1669          intrinsic = "llvm.x86.sse41.round.ss";
1670          break;
1671       case 64:
1672          intrinsic = "llvm.x86.sse41.round.sd";
1673          break;
1674       default:
1675          assert(0);
1676          return bld->undef;
1677       }
1678
1679       vec_type = LLVMVectorType(bld->elem_type, 4);
1680
1681       undef = LLVMGetUndef(vec_type);
1682
1683       args[0] = undef;
1684       args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1685       args[2] = LLVMConstInt(i32t, mode, 0);
1686
1687       res = lp_build_intrinsic(builder, intrinsic,
1688                                vec_type, args, Elements(args));
1689
1690       res = LLVMBuildExtractElement(builder, res, index0, "");
1691    }
1692    else {
1693       if (type.width * type.length == 128) {
1694          switch(type.width) {
1695          case 32:
1696             intrinsic = "llvm.x86.sse41.round.ps";
1697             break;
1698          case 64:
1699             intrinsic = "llvm.x86.sse41.round.pd";
1700             break;
1701          default:
1702             assert(0);
1703             return bld->undef;
1704          }
1705       }
1706       else {
1707          assert(type.width * type.length == 256);
1708          assert(util_cpu_caps.has_avx);
1709
1710          switch(type.width) {
1711          case 32:
1712             intrinsic = "llvm.x86.avx.round.ps.256";
1713             break;
1714          case 64:
1715             intrinsic = "llvm.x86.avx.round.pd.256";
1716             break;
1717          default:
1718             assert(0);
1719             return bld->undef;
1720          }
1721       }
1722
1723       res = lp_build_intrinsic_binary(builder, intrinsic,
1724                                       bld->vec_type, a,
1725                                       LLVMConstInt(i32t, mode, 0));
1726    }
1727
1728    return res;
1729 }
1730
1731
1732 static INLINE LLVMValueRef
1733 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1734                              LLVMValueRef a)
1735 {
1736    LLVMBuilderRef builder = bld->gallivm->builder;
1737    const struct lp_type type = bld->type;
1738    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1739    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1740    const char *intrinsic;
1741    LLVMValueRef res;
1742
1743    assert(type.floating);
1744    /* using the double precision conversions is a bit more complicated */
1745    assert(type.width == 32);
1746
1747    assert(lp_check_value(type, a));
1748    assert(util_cpu_caps.has_sse2);
1749
1750    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1751    if (type.length == 1) {
1752       LLVMTypeRef vec_type;
1753       LLVMValueRef undef;
1754       LLVMValueRef arg;
1755       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1756
1757       vec_type = LLVMVectorType(bld->elem_type, 4);
1758
1759       intrinsic = "llvm.x86.sse.cvtss2si";
1760
1761       undef = LLVMGetUndef(vec_type);
1762
1763       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1764
1765       res = lp_build_intrinsic_unary(builder, intrinsic,
1766                                      ret_type, arg);
1767    }
1768    else {
1769       if (type.width* type.length == 128) {
1770          intrinsic = "llvm.x86.sse2.cvtps2dq";
1771       }
1772       else {
1773          assert(type.width*type.length == 256);
1774          assert(util_cpu_caps.has_avx);
1775
1776          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1777       }
1778       res = lp_build_intrinsic_unary(builder, intrinsic,
1779                                      ret_type, a);
1780    }
1781
1782    return res;
1783 }
1784
1785
1786 /*
1787  */
1788 static INLINE LLVMValueRef
1789 lp_build_round_altivec(struct lp_build_context *bld,
1790                        LLVMValueRef a,
1791                        enum lp_build_round_mode mode)
1792 {
1793    LLVMBuilderRef builder = bld->gallivm->builder;
1794    const struct lp_type type = bld->type;
1795    const char *intrinsic = NULL;
1796
1797    assert(type.floating);
1798
1799    assert(lp_check_value(type, a));
1800    assert(util_cpu_caps.has_altivec);
1801
1802    switch (mode) {
1803    case LP_BUILD_ROUND_NEAREST:
1804       intrinsic = "llvm.ppc.altivec.vrfin";
1805       break;
1806    case LP_BUILD_ROUND_FLOOR:
1807       intrinsic = "llvm.ppc.altivec.vrfim";
1808       break;
1809    case LP_BUILD_ROUND_CEIL:
1810       intrinsic = "llvm.ppc.altivec.vrfip";
1811       break;
1812    case LP_BUILD_ROUND_TRUNCATE:
1813       intrinsic = "llvm.ppc.altivec.vrfiz";
1814       break;
1815    }
1816
1817    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1818 }
1819
1820 static INLINE LLVMValueRef
1821 lp_build_round_arch(struct lp_build_context *bld,
1822                     LLVMValueRef a,
1823                     enum lp_build_round_mode mode)
1824 {
1825    if (util_cpu_caps.has_sse4_1)
1826      return lp_build_round_sse41(bld, a, mode);
1827    else /* (util_cpu_caps.has_altivec) */
1828      return lp_build_round_altivec(bld, a, mode);
1829 }
1830
1831 /**
1832  * Return the integer part of a float (vector) value (== round toward zero).
1833  * The returned value is a float (vector).
1834  * Ex: trunc(-1.5) = -1.0
1835  */
1836 LLVMValueRef
1837 lp_build_trunc(struct lp_build_context *bld,
1838                LLVMValueRef a)
1839 {
1840    LLVMBuilderRef builder = bld->gallivm->builder;
1841    const struct lp_type type = bld->type;
1842
1843    assert(type.floating);
1844    assert(lp_check_value(type, a));
1845
1846    if (arch_rounding_available(type)) {
1847       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1848    }
1849    else {
1850       const struct lp_type type = bld->type;
1851       struct lp_type inttype;
1852       struct lp_build_context intbld;
1853       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1854       LLVMValueRef trunc, res, anosign, mask;
1855       LLVMTypeRef int_vec_type = bld->int_vec_type;
1856       LLVMTypeRef vec_type = bld->vec_type;
1857
1858       assert(type.width == 32); /* might want to handle doubles at some point */
1859
1860       inttype = type;
1861       inttype.floating = 0;
1862       lp_build_context_init(&intbld, bld->gallivm, inttype);
1863
1864       /* round by truncation */
1865       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1866       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1867
1868       /* mask out sign bit */
1869       anosign = lp_build_abs(bld, a);
1870       /*
1871        * mask out all values if anosign > 2^24
1872        * This should work both for large ints (all rounding is no-op for them
1873        * because such floats are always exact) as well as special cases like
1874        * NaNs, Infs (taking advantage of the fact they use max exponent).
1875        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1876        */
1877       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1878       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1879       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1880       return lp_build_select(bld, mask, a, res);
1881    }
1882 }
1883
1884
1885 /**
1886  * Return float (vector) rounded to nearest integer (vector).  The returned
1887  * value is a float (vector).
1888  * Ex: round(0.9) = 1.0
1889  * Ex: round(-1.5) = -2.0
1890  */
1891 LLVMValueRef
1892 lp_build_round(struct lp_build_context *bld,
1893                LLVMValueRef a)
1894 {
1895    LLVMBuilderRef builder = bld->gallivm->builder;
1896    const struct lp_type type = bld->type;
1897
1898    assert(type.floating);
1899    assert(lp_check_value(type, a));
1900
1901    if (arch_rounding_available(type)) {
1902       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1903    }
1904    else {
1905       const struct lp_type type = bld->type;
1906       struct lp_type inttype;
1907       struct lp_build_context intbld;
1908       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1909       LLVMValueRef res, anosign, mask;
1910       LLVMTypeRef int_vec_type = bld->int_vec_type;
1911       LLVMTypeRef vec_type = bld->vec_type;
1912
1913       assert(type.width == 32); /* might want to handle doubles at some point */
1914
1915       inttype = type;
1916       inttype.floating = 0;
1917       lp_build_context_init(&intbld, bld->gallivm, inttype);
1918
1919       res = lp_build_iround(bld, a);
1920       res = LLVMBuildSIToFP(builder, res, vec_type, "");
1921
1922       /* mask out sign bit */
1923       anosign = lp_build_abs(bld, a);
1924       /*
1925        * mask out all values if anosign > 2^24
1926        * This should work both for large ints (all rounding is no-op for them
1927        * because such floats are always exact) as well as special cases like
1928        * NaNs, Infs (taking advantage of the fact they use max exponent).
1929        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1930        */
1931       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1932       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1933       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1934       return lp_build_select(bld, mask, a, res);
1935    }
1936 }
1937
1938
1939 /**
1940  * Return floor of float (vector), result is a float (vector)
1941  * Ex: floor(1.1) = 1.0
1942  * Ex: floor(-1.1) = -2.0
1943  */
1944 LLVMValueRef
1945 lp_build_floor(struct lp_build_context *bld,
1946                LLVMValueRef a)
1947 {
1948    LLVMBuilderRef builder = bld->gallivm->builder;
1949    const struct lp_type type = bld->type;
1950
1951    assert(type.floating);
1952    assert(lp_check_value(type, a));
1953
1954    if (arch_rounding_available(type)) {
1955       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1956    }
1957    else {
1958       const struct lp_type type = bld->type;
1959       struct lp_type inttype;
1960       struct lp_build_context intbld;
1961       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1962       LLVMValueRef trunc, res, anosign, mask;
1963       LLVMTypeRef int_vec_type = bld->int_vec_type;
1964       LLVMTypeRef vec_type = bld->vec_type;
1965
1966       assert(type.width == 32); /* might want to handle doubles at some point */
1967
1968       inttype = type;
1969       inttype.floating = 0;
1970       lp_build_context_init(&intbld, bld->gallivm, inttype);
1971
1972       /* round by truncation */
1973       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1974       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1975
1976       if (type.sign) {
1977          LLVMValueRef tmp;
1978
1979          /*
1980           * fix values if rounding is wrong (for non-special cases)
1981           * - this is the case if trunc > a
1982           */
1983          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1984          /* tmp = trunc > a ? 1.0 : 0.0 */
1985          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1986          tmp = lp_build_and(&intbld, mask, tmp);
1987          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1988          res = lp_build_sub(bld, res, tmp);
1989       }
1990
1991       /* mask out sign bit */
1992       anosign = lp_build_abs(bld, a);
1993       /*
1994        * mask out all values if anosign > 2^24
1995        * This should work both for large ints (all rounding is no-op for them
1996        * because such floats are always exact) as well as special cases like
1997        * NaNs, Infs (taking advantage of the fact they use max exponent).
1998        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1999        */
2000       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2001       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2002       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2003       return lp_build_select(bld, mask, a, res);
2004    }
2005 }
2006
2007
2008 /**
2009  * Return ceiling of float (vector), returning float (vector).
2010  * Ex: ceil( 1.1) = 2.0
2011  * Ex: ceil(-1.1) = -1.0
2012  */
2013 LLVMValueRef
2014 lp_build_ceil(struct lp_build_context *bld,
2015               LLVMValueRef a)
2016 {
2017    LLVMBuilderRef builder = bld->gallivm->builder;
2018    const struct lp_type type = bld->type;
2019
2020    assert(type.floating);
2021    assert(lp_check_value(type, a));
2022
2023    if (arch_rounding_available(type)) {
2024       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2025    }
2026    else {
2027       const struct lp_type type = bld->type;
2028       struct lp_type inttype;
2029       struct lp_build_context intbld;
2030       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
2031       LLVMValueRef trunc, res, anosign, mask, tmp;
2032       LLVMTypeRef int_vec_type = bld->int_vec_type;
2033       LLVMTypeRef vec_type = bld->vec_type;
2034
2035       assert(type.width == 32); /* might want to handle doubles at some point */
2036
2037       inttype = type;
2038       inttype.floating = 0;
2039       lp_build_context_init(&intbld, bld->gallivm, inttype);
2040
2041       /* round by truncation */
2042       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2043       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2044
2045       /*
2046        * fix values if rounding is wrong (for non-special cases)
2047        * - this is the case if trunc < a
2048        */
2049       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2050       /* tmp = trunc < a ? 1.0 : 0.0 */
2051       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2052       tmp = lp_build_and(&intbld, mask, tmp);
2053       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2054       res = lp_build_add(bld, trunc, tmp);
2055
2056       /* mask out sign bit */
2057       anosign = lp_build_abs(bld, a);
2058       /*
2059        * mask out all values if anosign > 2^24
2060        * This should work both for large ints (all rounding is no-op for them
2061        * because such floats are always exact) as well as special cases like
2062        * NaNs, Infs (taking advantage of the fact they use max exponent).
2063        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2064        */
2065       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2066       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2067       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2068       return lp_build_select(bld, mask, a, res);
2069    }
2070 }
2071
2072
2073 /**
2074  * Return fractional part of 'a' computed as a - floor(a)
2075  * Typically used in texture coord arithmetic.
2076  */
2077 LLVMValueRef
2078 lp_build_fract(struct lp_build_context *bld,
2079                LLVMValueRef a)
2080 {
2081    assert(bld->type.floating);
2082    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2083 }
2084
2085
2086 /**
2087  * Prevent returning a fractional part of 1.0 for very small negative values of
2088  * 'a' by clamping against 0.99999(9).
2089  */
2090 static inline LLVMValueRef
2091 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2092 {
2093    LLVMValueRef max;
2094
2095    /* this is the largest number smaller than 1.0 representable as float */
2096    max = lp_build_const_vec(bld->gallivm, bld->type,
2097                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2098    return lp_build_min(bld, fract, max);
2099 }
2100
2101
2102 /**
2103  * Same as lp_build_fract, but guarantees that the result is always smaller
2104  * than one.
2105  */
2106 LLVMValueRef
2107 lp_build_fract_safe(struct lp_build_context *bld,
2108                     LLVMValueRef a)
2109 {
2110    return clamp_fract(bld, lp_build_fract(bld, a));
2111 }
2112
2113
2114 /**
2115  * Return the integer part of a float (vector) value (== round toward zero).
2116  * The returned value is an integer (vector).
2117  * Ex: itrunc(-1.5) = -1
2118  */
2119 LLVMValueRef
2120 lp_build_itrunc(struct lp_build_context *bld,
2121                 LLVMValueRef a)
2122 {
2123    LLVMBuilderRef builder = bld->gallivm->builder;
2124    const struct lp_type type = bld->type;
2125    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2126
2127    assert(type.floating);
2128    assert(lp_check_value(type, a));
2129
2130    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2131 }
2132
2133
2134 /**
2135  * Return float (vector) rounded to nearest integer (vector).  The returned
2136  * value is an integer (vector).
2137  * Ex: iround(0.9) = 1
2138  * Ex: iround(-1.5) = -2
2139  */
2140 LLVMValueRef
2141 lp_build_iround(struct lp_build_context *bld,
2142                 LLVMValueRef a)
2143 {
2144    LLVMBuilderRef builder = bld->gallivm->builder;
2145    const struct lp_type type = bld->type;
2146    LLVMTypeRef int_vec_type = bld->int_vec_type;
2147    LLVMValueRef res;
2148
2149    assert(type.floating);
2150
2151    assert(lp_check_value(type, a));
2152
2153    if ((util_cpu_caps.has_sse2 &&
2154        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2155        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2156       return lp_build_iround_nearest_sse2(bld, a);
2157    }
2158    if (arch_rounding_available(type)) {
2159       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2160    }
2161    else {
2162       LLVMValueRef half;
2163
2164       half = lp_build_const_vec(bld->gallivm, type, 0.5);
2165
2166       if (type.sign) {
2167          LLVMTypeRef vec_type = bld->vec_type;
2168          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2169                                     (unsigned long long)1 << (type.width - 1));
2170          LLVMValueRef sign;
2171
2172          /* get sign bit */
2173          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2174          sign = LLVMBuildAnd(builder, sign, mask, "");
2175
2176          /* sign * 0.5 */
2177          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2178          half = LLVMBuildOr(builder, sign, half, "");
2179          half = LLVMBuildBitCast(builder, half, vec_type, "");
2180       }
2181
2182       res = LLVMBuildFAdd(builder, a, half, "");
2183    }
2184
2185    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2186
2187    return res;
2188 }
2189
2190
2191 /**
2192  * Return floor of float (vector), result is an int (vector)
2193  * Ex: ifloor(1.1) = 1.0
2194  * Ex: ifloor(-1.1) = -2.0
2195  */
2196 LLVMValueRef
2197 lp_build_ifloor(struct lp_build_context *bld,
2198                 LLVMValueRef a)
2199 {
2200    LLVMBuilderRef builder = bld->gallivm->builder;
2201    const struct lp_type type = bld->type;
2202    LLVMTypeRef int_vec_type = bld->int_vec_type;
2203    LLVMValueRef res;
2204
2205    assert(type.floating);
2206    assert(lp_check_value(type, a));
2207
2208    res = a;
2209    if (type.sign) {
2210       if (arch_rounding_available(type)) {
2211          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2212       }
2213       else {
2214          struct lp_type inttype;
2215          struct lp_build_context intbld;
2216          LLVMValueRef trunc, itrunc, mask;
2217
2218          assert(type.floating);
2219          assert(lp_check_value(type, a));
2220
2221          inttype = type;
2222          inttype.floating = 0;
2223          lp_build_context_init(&intbld, bld->gallivm, inttype);
2224
2225          /* round by truncation */
2226          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2227          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2228
2229          /*
2230           * fix values if rounding is wrong (for non-special cases)
2231           * - this is the case if trunc > a
2232           * The results of doing this with NaNs, very large values etc.
2233           * are undefined but this seems to be the case anyway.
2234           */
2235          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2236          /* cheapie minus one with mask since the mask is minus one / zero */
2237          return lp_build_add(&intbld, itrunc, mask);
2238       }
2239    }
2240
2241    /* round to nearest (toward zero) */
2242    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2243
2244    return res;
2245 }
2246
2247
2248 /**
2249  * Return ceiling of float (vector), returning int (vector).
2250  * Ex: iceil( 1.1) = 2
2251  * Ex: iceil(-1.1) = -1
2252  */
2253 LLVMValueRef
2254 lp_build_iceil(struct lp_build_context *bld,
2255                LLVMValueRef a)
2256 {
2257    LLVMBuilderRef builder = bld->gallivm->builder;
2258    const struct lp_type type = bld->type;
2259    LLVMTypeRef int_vec_type = bld->int_vec_type;
2260    LLVMValueRef res;
2261
2262    assert(type.floating);
2263    assert(lp_check_value(type, a));
2264
2265    if (arch_rounding_available(type)) {
2266       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2267    }
2268    else {
2269       struct lp_type inttype;
2270       struct lp_build_context intbld;
2271       LLVMValueRef trunc, itrunc, mask;
2272
2273       assert(type.floating);
2274       assert(lp_check_value(type, a));
2275
2276       inttype = type;
2277       inttype.floating = 0;
2278       lp_build_context_init(&intbld, bld->gallivm, inttype);
2279
2280       /* round by truncation */
2281       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2282       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2283
2284       /*
2285        * fix values if rounding is wrong (for non-special cases)
2286        * - this is the case if trunc < a
2287        * The results of doing this with NaNs, very large values etc.
2288        * are undefined but this seems to be the case anyway.
2289        */
2290       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2291       /* cheapie plus one with mask since the mask is minus one / zero */
2292       return lp_build_sub(&intbld, itrunc, mask);
2293    }
2294
2295    /* round to nearest (toward zero) */
2296    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2297
2298    return res;
2299 }
2300
2301
2302 /**
2303  * Combined ifloor() & fract().
2304  *
2305  * Preferred to calling the functions separately, as it will ensure that the
2306  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2307  */
2308 void
2309 lp_build_ifloor_fract(struct lp_build_context *bld,
2310                       LLVMValueRef a,
2311                       LLVMValueRef *out_ipart,
2312                       LLVMValueRef *out_fpart)
2313 {
2314    LLVMBuilderRef builder = bld->gallivm->builder;
2315    const struct lp_type type = bld->type;
2316    LLVMValueRef ipart;
2317
2318    assert(type.floating);
2319    assert(lp_check_value(type, a));
2320
2321    if (arch_rounding_available(type)) {
2322       /*
2323        * floor() is easier.
2324        */
2325
2326       ipart = lp_build_floor(bld, a);
2327       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2328       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2329    }
2330    else {
2331       /*
2332        * ifloor() is easier.
2333        */
2334
2335       *out_ipart = lp_build_ifloor(bld, a);
2336       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2337       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2338    }
2339 }
2340
2341
2342 /**
2343  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2344  * always smaller than one.
2345  */
2346 void
2347 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2348                            LLVMValueRef a,
2349                            LLVMValueRef *out_ipart,
2350                            LLVMValueRef *out_fpart)
2351 {
2352    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2353    *out_fpart = clamp_fract(bld, *out_fpart);
2354 }
2355
2356
2357 LLVMValueRef
2358 lp_build_sqrt(struct lp_build_context *bld,
2359               LLVMValueRef a)
2360 {
2361    LLVMBuilderRef builder = bld->gallivm->builder;
2362    const struct lp_type type = bld->type;
2363    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2364    char intrinsic[32];
2365
2366    assert(lp_check_value(type, a));
2367
2368    /* TODO: optimize the constant case */
2369
2370    assert(type.floating);
2371    if (type.length == 1) {
2372       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2373    }
2374    else {
2375       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2376    }
2377
2378    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2379 }
2380
2381
2382 /**
2383  * Do one Newton-Raphson step to improve reciprocate precision:
2384  *
2385  *   x_{i+1} = x_i * (2 - a * x_i)
2386  *
2387  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2388  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2389  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2390  * halo. It would be necessary to clamp the argument to prevent this.
2391  *
2392  * See also:
2393  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2394  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2395  */
2396 static INLINE LLVMValueRef
2397 lp_build_rcp_refine(struct lp_build_context *bld,
2398                     LLVMValueRef a,
2399                     LLVMValueRef rcp_a)
2400 {
2401    LLVMBuilderRef builder = bld->gallivm->builder;
2402    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2403    LLVMValueRef res;
2404
2405    res = LLVMBuildFMul(builder, a, rcp_a, "");
2406    res = LLVMBuildFSub(builder, two, res, "");
2407    res = LLVMBuildFMul(builder, rcp_a, res, "");
2408
2409    return res;
2410 }
2411
2412
2413 LLVMValueRef
2414 lp_build_rcp(struct lp_build_context *bld,
2415              LLVMValueRef a)
2416 {
2417    LLVMBuilderRef builder = bld->gallivm->builder;
2418    const struct lp_type type = bld->type;
2419
2420    assert(lp_check_value(type, a));
2421
2422    if(a == bld->zero)
2423       return bld->undef;
2424    if(a == bld->one)
2425       return bld->one;
2426    if(a == bld->undef)
2427       return bld->undef;
2428
2429    assert(type.floating);
2430
2431    if(LLVMIsConstant(a))
2432       return LLVMConstFDiv(bld->one, a);
2433
2434    /*
2435     * We don't use RCPPS because:
2436     * - it only has 10bits of precision
2437     * - it doesn't even get the reciprocate of 1.0 exactly
2438     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2439     * - for recent processors the benefit over DIVPS is marginal, a case
2440     *   dependent
2441     *
2442     * We could still use it on certain processors if benchmarks show that the
2443     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2444     * particular uses that require less workarounds.
2445     */
2446
2447    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2448          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2449       const unsigned num_iterations = 0;
2450       LLVMValueRef res;
2451       unsigned i;
2452       const char *intrinsic = NULL;
2453
2454       if (type.length == 4) {
2455          intrinsic = "llvm.x86.sse.rcp.ps";
2456       }
2457       else {
2458          intrinsic = "llvm.x86.avx.rcp.ps.256";
2459       }
2460
2461       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2462
2463       for (i = 0; i < num_iterations; ++i) {
2464          res = lp_build_rcp_refine(bld, a, res);
2465       }
2466
2467       return res;
2468    }
2469
2470    return LLVMBuildFDiv(builder, bld->one, a, "");
2471 }
2472
2473
2474 /**
2475  * Do one Newton-Raphson step to improve rsqrt precision:
2476  *
2477  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2478  *
2479  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2480  */
2481 static INLINE LLVMValueRef
2482 lp_build_rsqrt_refine(struct lp_build_context *bld,
2483                       LLVMValueRef a,
2484                       LLVMValueRef rsqrt_a)
2485 {
2486    LLVMBuilderRef builder = bld->gallivm->builder;
2487    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2488    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2489    LLVMValueRef res;
2490
2491    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2492    res = LLVMBuildFMul(builder, a, res, "");
2493    res = LLVMBuildFSub(builder, three, res, "");
2494    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2495    res = LLVMBuildFMul(builder, half, res, "");
2496
2497    return res;
2498 }
2499
2500
2501 /**
2502  * Generate 1/sqrt(a).
2503  * Result is undefined for values < 0, infinity for +0.
2504  */
2505 LLVMValueRef
2506 lp_build_rsqrt(struct lp_build_context *bld,
2507                LLVMValueRef a)
2508 {
2509    LLVMBuilderRef builder = bld->gallivm->builder;
2510    const struct lp_type type = bld->type;
2511
2512    assert(lp_check_value(type, a));
2513
2514    assert(type.floating);
2515
2516    /*
2517     * This should be faster but all denormals will end up as infinity.
2518     */
2519    if (0 && lp_build_fast_rsqrt_available(type)) {
2520       const unsigned num_iterations = 1;
2521       LLVMValueRef res;
2522       unsigned i;
2523
2524       /* rsqrt(1.0) != 1.0 here */
2525       res = lp_build_fast_rsqrt(bld, a);
2526
2527       if (num_iterations) {
2528          /*
2529           * Newton-Raphson will result in NaN instead of infinity for zero,
2530           * and NaN instead of zero for infinity.
2531           * Also, need to ensure rsqrt(1.0) == 1.0.
2532           * All numbers smaller than FLT_MIN will result in +infinity
2533           * (rsqrtps treats all denormals as zero).
2534           */
2535          /*
2536           * Certain non-c99 compilers don't know INFINITY and might not support
2537           * hacks to evaluate it at compile time neither.
2538           */
2539          const unsigned posinf_int = 0x7F800000;
2540          LLVMValueRef cmp;
2541          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2542          LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2543
2544          inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2545
2546          for (i = 0; i < num_iterations; ++i) {
2547             res = lp_build_rsqrt_refine(bld, a, res);
2548          }
2549          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2550          res = lp_build_select(bld, cmp, inf, res);
2551          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2552          res = lp_build_select(bld, cmp, bld->zero, res);
2553          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2554          res = lp_build_select(bld, cmp, bld->one, res);
2555       }
2556
2557       return res;
2558    }
2559
2560    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2561 }
2562
2563 /**
2564  * If there's a fast (inaccurate) rsqrt instruction available
2565  * (caller may want to avoid to call rsqrt_fast if it's not available,
2566  * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2567  * unavailable it would result in sqrt/div/mul so obviously
2568  * much better to just call sqrt, skipping both div and mul).
2569  */
2570 boolean
2571 lp_build_fast_rsqrt_available(struct lp_type type)
2572 {
2573    assert(type.floating);
2574
2575    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2576        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2577       return true;
2578    }
2579    return false;
2580 }
2581
2582
2583 /**
2584  * Generate 1/sqrt(a).
2585  * Result is undefined for values < 0, infinity for +0.
2586  * Precision is limited, only ~10 bits guaranteed
2587  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2588  */
2589 LLVMValueRef
2590 lp_build_fast_rsqrt(struct lp_build_context *bld,
2591                     LLVMValueRef a)
2592 {
2593    LLVMBuilderRef builder = bld->gallivm->builder;
2594    const struct lp_type type = bld->type;
2595
2596    assert(lp_check_value(type, a));
2597
2598    if (lp_build_fast_rsqrt_available(type)) {
2599       const char *intrinsic = NULL;
2600
2601       if (type.length == 4) {
2602          intrinsic = "llvm.x86.sse.rsqrt.ps";
2603       }
2604       else {
2605          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2606       }
2607       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2608    }
2609    else {
2610       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2611    }
2612    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2613 }
2614
2615
2616 /**
2617  * Generate sin(a) or cos(a) using polynomial approximation.
2618  * TODO: it might be worth recognizing sin and cos using same source
2619  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2620  * would be way cheaper than calculating (nearly) everything twice...
2621  * Not sure it's common enough to be worth bothering however, scs
2622  * opcode could also benefit from calculating both though.
2623  */
2624 static LLVMValueRef
2625 lp_build_sin_or_cos(struct lp_build_context *bld,
2626                     LLVMValueRef a,
2627                     boolean cos)
2628 {
2629    struct gallivm_state *gallivm = bld->gallivm;
2630    LLVMBuilderRef b = gallivm->builder;
2631    struct lp_type int_type = lp_int_type(bld->type);
2632
2633    /*
2634     *  take the absolute value,
2635     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2636     */
2637
2638    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2639    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2640
2641    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2642    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2643
2644    /*
2645     * scale by 4/Pi
2646     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2647     */
2648
2649    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2650    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2651
2652    /*
2653     * store the integer part of y in mm0
2654     * emm2 = _mm_cvttps_epi32(y);
2655     */
2656
2657    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2658
2659    /*
2660     * j=(j+1) & (~1) (see the cephes sources)
2661     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2662     */
2663
2664    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2665    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2666    /*
2667     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2668     */
2669    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2670    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2671
2672    /*
2673     * y = _mm_cvtepi32_ps(emm2);
2674     */
2675    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2676
2677    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2678    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2679    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2680    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2681
2682    /*
2683     * Argument used for poly selection and sign bit determination
2684     * is different for sin vs. cos.
2685     */
2686    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2687                                emm2_and;
2688
2689    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2690                                                               LLVMBuildNot(b, emm2_2, ""), ""),
2691                                               const_29, "sign_bit") :
2692                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2693                                                               LLVMBuildShl(b, emm2_add,
2694                                                                            const_29, ""), ""),
2695                                               sign_mask, "sign_bit");
2696
2697    /*
2698     * get the polynom selection mask
2699     * there is one polynom for 0 <= x <= Pi/4
2700     * and another one for Pi/4<x<=Pi/2
2701     * Both branches will be computed.
2702     *
2703     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2704     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2705     */
2706
2707    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2708    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2709                                              int_type, PIPE_FUNC_EQUAL,
2710                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2711
2712    /*
2713     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2714     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2715     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2716     */
2717    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2718    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2719    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2720
2721    /*
2722     * The magic pass: "Extended precision modular arithmetic"
2723     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2724     * xmm1 = _mm_mul_ps(y, xmm1);
2725     * xmm2 = _mm_mul_ps(y, xmm2);
2726     * xmm3 = _mm_mul_ps(y, xmm3);
2727     */
2728    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2729    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2730    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2731
2732    /*
2733     * x = _mm_add_ps(x, xmm1);
2734     * x = _mm_add_ps(x, xmm2);
2735     * x = _mm_add_ps(x, xmm3);
2736     */
2737
2738    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2739    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2740    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2741
2742    /*
2743     * Evaluate the first polynom  (0 <= x <= Pi/4)
2744     *
2745     * z = _mm_mul_ps(x,x);
2746     */
2747    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2748
2749    /*
2750     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2751     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2752     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2753     */
2754    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2755    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2756    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2757
2758    /*
2759     * y = *(v4sf*)_ps_coscof_p0;
2760     * y = _mm_mul_ps(y, z);
2761     */
2762    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2763    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2764    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2765    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2766    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2767    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2768
2769
2770    /*
2771     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2772     * y = _mm_sub_ps(y, tmp);
2773     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2774     */
2775    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2776    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2777    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2778    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2779    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2780
2781    /*
2782     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2783     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2784     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2785     */
2786    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2787    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2788    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2789
2790    /*
2791     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2792     *
2793     * y2 = *(v4sf*)_ps_sincof_p0;
2794     * y2 = _mm_mul_ps(y2, z);
2795     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2796     * y2 = _mm_mul_ps(y2, z);
2797     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2798     * y2 = _mm_mul_ps(y2, z);
2799     * y2 = _mm_mul_ps(y2, x);
2800     * y2 = _mm_add_ps(y2, x);
2801     */
2802
2803    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2804    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2805    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2806    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2807    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2808    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2809    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2810
2811    /*
2812     * select the correct result from the two polynoms
2813     * xmm3 = poly_mask;
2814     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2815     * y = _mm_andnot_ps(xmm3, y);
2816     * y = _mm_or_ps(y,y2);
2817     */
2818    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2819    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2820    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2821    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2822    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2823    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2824
2825    /*
2826     * update the sign
2827     * y = _mm_xor_ps(y, sign_bit);
2828     */
2829    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2830    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2831
2832    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2833
2834    /* clamp output to be within [-1, 1] */
2835    y_result = lp_build_clamp(bld, y_result,
2836                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
2837                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
2838    /* If a is -inf, inf or NaN then return NaN */
2839    y_result = lp_build_select(bld, isfinite, y_result,
2840                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
2841    return y_result;
2842 }
2843
2844
2845 /**
2846  * Generate sin(a)
2847  */
2848 LLVMValueRef
2849 lp_build_sin(struct lp_build_context *bld,
2850              LLVMValueRef a)
2851 {
2852    return lp_build_sin_or_cos(bld, a, FALSE);
2853 }
2854
2855
2856 /**
2857  * Generate cos(a)
2858  */
2859 LLVMValueRef
2860 lp_build_cos(struct lp_build_context *bld,
2861              LLVMValueRef a)
2862 {
2863    return lp_build_sin_or_cos(bld, a, TRUE);
2864 }
2865
2866
2867 /**
2868  * Generate pow(x, y)
2869  */
2870 LLVMValueRef
2871 lp_build_pow(struct lp_build_context *bld,
2872              LLVMValueRef x,
2873              LLVMValueRef y)
2874 {
2875    /* TODO: optimize the constant case */
2876    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2877        LLVMIsConstant(x) && LLVMIsConstant(y)) {
2878       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2879                    __FUNCTION__);
2880    }
2881
2882    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2883 }
2884
2885
2886 /**
2887  * Generate exp(x)
2888  */
2889 LLVMValueRef
2890 lp_build_exp(struct lp_build_context *bld,
2891              LLVMValueRef x)
2892 {
2893    /* log2(e) = 1/log(2) */
2894    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2895                                            1.4426950408889634);
2896
2897    assert(lp_check_value(bld->type, x));
2898
2899    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2900 }
2901
2902
2903 /**
2904  * Generate log(x)
2905  * Behavior is undefined with infs, 0s and nans
2906  */
2907 LLVMValueRef
2908 lp_build_log(struct lp_build_context *bld,
2909              LLVMValueRef x)
2910 {
2911    /* log(2) */
2912    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2913                                           0.69314718055994529);
2914
2915    assert(lp_check_value(bld->type, x));
2916
2917    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2918 }
2919
2920 /**
2921  * Generate log(x) that handles edge cases (infs, 0s and nans)
2922  */
2923 LLVMValueRef
2924 lp_build_log_safe(struct lp_build_context *bld,
2925                   LLVMValueRef x)
2926 {
2927    /* log(2) */
2928    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2929                                           0.69314718055994529);
2930
2931    assert(lp_check_value(bld->type, x));
2932
2933    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2934 }
2935
2936
2937 /**
2938  * Generate polynomial.
2939  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2940  */
2941 LLVMValueRef
2942 lp_build_polynomial(struct lp_build_context *bld,
2943                     LLVMValueRef x,
2944                     const double *coeffs,
2945                     unsigned num_coeffs)
2946 {
2947    const struct lp_type type = bld->type;
2948    LLVMValueRef even = NULL, odd = NULL;
2949    LLVMValueRef x2;
2950    unsigned i;
2951
2952    assert(lp_check_value(bld->type, x));
2953
2954    /* TODO: optimize the constant case */
2955    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2956        LLVMIsConstant(x)) {
2957       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2958                    __FUNCTION__);
2959    }
2960
2961    /*
2962     * Calculate odd and even terms seperately to decrease data dependency
2963     * Ex:
2964     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
2965     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2966     */
2967    x2 = lp_build_mul(bld, x, x);
2968
2969    for (i = num_coeffs; i--; ) {
2970       LLVMValueRef coeff;
2971
2972       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2973
2974       if (i % 2 == 0) {
2975          if (even)
2976             even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2977          else
2978             even = coeff;
2979       } else {
2980          if (odd)
2981             odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2982          else
2983             odd = coeff;
2984       }
2985    }
2986
2987    if (odd)
2988       return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2989    else if (even)
2990       return even;
2991    else
2992       return bld->undef;
2993 }
2994
2995
2996 /**
2997  * Minimax polynomial fit of 2**x, in range [0, 1[
2998  */
2999 const double lp_build_exp2_polynomial[] = {
3000 #if EXP_POLY_DEGREE == 5
3001    1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3002    0.693153073200168932794,
3003    0.240153617044375388211,
3004    0.0558263180532956664775,
3005    0.00898934009049466391101,
3006    0.00187757667519147912699
3007 #elif EXP_POLY_DEGREE == 4
3008    1.00000259337069434683,
3009    0.693003834469974940458,
3010    0.24144275689150793076,
3011    0.0520114606103070150235,
3012    0.0135341679161270268764
3013 #elif EXP_POLY_DEGREE == 3
3014    0.999925218562710312959,
3015    0.695833540494823811697,
3016    0.226067155427249155588,
3017    0.0780245226406372992967
3018 #elif EXP_POLY_DEGREE == 2
3019    1.00172476321474503578,
3020    0.657636275736077639316,
3021    0.33718943461968720704
3022 #else
3023 #error
3024 #endif
3025 };
3026
3027
3028 LLVMValueRef
3029 lp_build_exp2(struct lp_build_context *bld,
3030               LLVMValueRef x)
3031 {
3032    LLVMBuilderRef builder = bld->gallivm->builder;
3033    const struct lp_type type = bld->type;
3034    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3035    LLVMValueRef ipart = NULL;
3036    LLVMValueRef fpart = NULL;
3037    LLVMValueRef expipart = NULL;
3038    LLVMValueRef expfpart = NULL;
3039    LLVMValueRef res = NULL;
3040
3041    assert(lp_check_value(bld->type, x));
3042
3043
3044    /* TODO: optimize the constant case */
3045    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3046        LLVMIsConstant(x)) {
3047       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3048                    __FUNCTION__);
3049    }
3050
3051    assert(type.floating && type.width == 32);
3052
3053    /* We want to preserve NaN and make sure than for exp2 if x > 128,
3054     * the result is INF  and if it's smaller than -126.9 the result is 0 */
3055    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3056                         GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
3057    x = lp_build_max(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x);
3058
3059    /* ipart = floor(x) */
3060    /* fpart = x - ipart */
3061    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3062
3063
3064
3065    /* expipart = (float) (1 << ipart) */
3066    expipart = LLVMBuildAdd(builder, ipart,
3067                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
3068    expipart = LLVMBuildShl(builder, expipart,
3069                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
3070    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3071
3072
3073    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3074                                   Elements(lp_build_exp2_polynomial));
3075
3076    res = LLVMBuildFMul(builder, expipart, expfpart, "");
3077
3078
3079    return res;
3080 }
3081
3082
3083
3084 /**
3085  * Extract the exponent of a IEEE-754 floating point value.
3086  *
3087  * Optionally apply an integer bias.
3088  *
3089  * Result is an integer value with
3090  *
3091  *   ifloor(log2(x)) + bias
3092  */
3093 LLVMValueRef
3094 lp_build_extract_exponent(struct lp_build_context *bld,
3095                           LLVMValueRef x,
3096                           int bias)
3097 {
3098    LLVMBuilderRef builder = bld->gallivm->builder;
3099    const struct lp_type type = bld->type;
3100    unsigned mantissa = lp_mantissa(type);
3101    LLVMValueRef res;
3102
3103    assert(type.floating);
3104
3105    assert(lp_check_value(bld->type, x));
3106
3107    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3108
3109    res = LLVMBuildLShr(builder, x,
3110                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3111    res = LLVMBuildAnd(builder, res,
3112                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3113    res = LLVMBuildSub(builder, res,
3114                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3115
3116    return res;
3117 }
3118
3119
3120 /**
3121  * Extract the mantissa of the a floating.
3122  *
3123  * Result is a floating point value with
3124  *
3125  *   x / floor(log2(x))
3126  */
3127 LLVMValueRef
3128 lp_build_extract_mantissa(struct lp_build_context *bld,
3129                           LLVMValueRef x)
3130 {
3131    LLVMBuilderRef builder = bld->gallivm->builder;
3132    const struct lp_type type = bld->type;
3133    unsigned mantissa = lp_mantissa(type);
3134    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3135                                                   (1ULL << mantissa) - 1);
3136    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3137    LLVMValueRef res;
3138
3139    assert(lp_check_value(bld->type, x));
3140
3141    assert(type.floating);
3142
3143    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3144
3145    /* res = x / 2**ipart */
3146    res = LLVMBuildAnd(builder, x, mantmask, "");
3147    res = LLVMBuildOr(builder, res, one, "");
3148    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3149
3150    return res;
3151 }
3152
3153
3154
3155 /**
3156  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3157  * These coefficients can be generate with
3158  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3159  */
3160 const double lp_build_log2_polynomial[] = {
3161 #if LOG_POLY_DEGREE == 5
3162    2.88539008148777786488L,
3163    0.961796878841293367824L,
3164    0.577058946784739859012L,
3165    0.412914355135828735411L,
3166    0.308591899232910175289L,
3167    0.352376952300281371868L,
3168 #elif LOG_POLY_DEGREE == 4
3169    2.88539009343309178325L,
3170    0.961791550404184197881L,
3171    0.577440339438736392009L,
3172    0.403343858251329912514L,
3173    0.406718052498846252698L,
3174 #elif LOG_POLY_DEGREE == 3
3175    2.88538959748872753838L,
3176    0.961932915889597772928L,
3177    0.571118517972136195241L,
3178    0.493997535084709500285L,
3179 #else
3180 #error
3181 #endif
3182 };
3183
3184 /**
3185  * See http://www.devmaster.net/forums/showthread.php?p=43580
3186  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3187  * http://www.nezumi.demon.co.uk/consult/logx.htm
3188  *
3189  * If handle_edge_cases is true the function will perform computations
3190  * to match the required D3D10+ behavior for each of the edge cases.
3191  * That means that if input is:
3192  * - less than zero (to and including -inf) then NaN will be returned
3193  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3194  * - +infinity, then +infinity will be returned
3195  * - NaN, then NaN will be returned
3196  *
3197  * Those checks are fairly expensive so if you don't need them make sure
3198  * handle_edge_cases is false.
3199  */
3200 void
3201 lp_build_log2_approx(struct lp_build_context *bld,
3202                      LLVMValueRef x,
3203                      LLVMValueRef *p_exp,
3204                      LLVMValueRef *p_floor_log2,
3205                      LLVMValueRef *p_log2,
3206                      boolean handle_edge_cases)
3207 {
3208    LLVMBuilderRef builder = bld->gallivm->builder;
3209    const struct lp_type type = bld->type;
3210    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3211    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3212
3213    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3214    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3215    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3216
3217    LLVMValueRef i = NULL;
3218    LLVMValueRef y = NULL;
3219    LLVMValueRef z = NULL;
3220    LLVMValueRef exp = NULL;
3221    LLVMValueRef mant = NULL;
3222    LLVMValueRef logexp = NULL;
3223    LLVMValueRef logmant = NULL;
3224    LLVMValueRef res = NULL;
3225
3226    assert(lp_check_value(bld->type, x));
3227
3228    if(p_exp || p_floor_log2 || p_log2) {
3229       /* TODO: optimize the constant case */
3230       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3231           LLVMIsConstant(x)) {
3232          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3233                       __FUNCTION__);
3234       }
3235
3236       assert(type.floating && type.width == 32);
3237
3238       /*
3239        * We don't explicitly handle denormalized numbers. They will yield a
3240        * result in the neighbourhood of -127, which appears to be adequate
3241        * enough.
3242        */
3243
3244       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3245
3246       /* exp = (float) exponent(x) */
3247       exp = LLVMBuildAnd(builder, i, expmask, "");
3248    }
3249
3250    if(p_floor_log2 || p_log2) {
3251       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3252       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3253       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3254    }
3255
3256    if(p_log2) {
3257       /* mant = 1 + (float) mantissa(x) */
3258       mant = LLVMBuildAnd(builder, i, mantmask, "");
3259       mant = LLVMBuildOr(builder, mant, one, "");
3260       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3261
3262       /* y = (mant - 1) / (mant + 1) */
3263       y = lp_build_div(bld,
3264          lp_build_sub(bld, mant, bld->one),
3265          lp_build_add(bld, mant, bld->one)
3266       );
3267
3268       /* z = y^2 */
3269       z = lp_build_mul(bld, y, y);
3270
3271       /* compute P(z) */
3272       logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3273                                     Elements(lp_build_log2_polynomial));
3274
3275       /* logmant = y * P(z) */
3276       logmant = lp_build_mul(bld, y, logmant);
3277
3278       res = lp_build_add(bld, logmant, logexp);
3279
3280       if (type.floating && handle_edge_cases) {
3281          LLVMValueRef negmask, infmask,  zmask;
3282          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3283                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
3284          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3285                               lp_build_const_vec(bld->gallivm, type,  0.0f));
3286          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3287                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
3288
3289          /* If x is qual to inf make sure we return inf */
3290          res = lp_build_select(bld, infmask,
3291                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
3292                                res);
3293          /* If x is qual to 0, return -inf */
3294          res = lp_build_select(bld, zmask,
3295                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3296                                res);
3297          /* If x is nan or less than 0, return nan */
3298          res = lp_build_select(bld, negmask,
3299                                lp_build_const_vec(bld->gallivm, type,  NAN),
3300                                res);
3301       }
3302    }
3303
3304    if(p_exp) {
3305       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3306       *p_exp = exp;
3307    }
3308
3309    if(p_floor_log2)
3310       *p_floor_log2 = logexp;
3311
3312    if(p_log2)
3313       *p_log2 = res;
3314 }
3315
3316
3317 /*
3318  * log2 implementation which doesn't have special code to
3319  * handle edge cases (-inf, 0, inf, NaN). It's faster but
3320  * the results for those cases are undefined.
3321  */
3322 LLVMValueRef
3323 lp_build_log2(struct lp_build_context *bld,
3324               LLVMValueRef x)
3325 {
3326    LLVMValueRef res;
3327    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3328    return res;
3329 }
3330
3331 /*
3332  * Version of log2 which handles all edge cases.
3333  * Look at documentation of lp_build_log2_approx for
3334  * description of the behavior for each of the edge cases.
3335  */
3336 LLVMValueRef
3337 lp_build_log2_safe(struct lp_build_context *bld,
3338                    LLVMValueRef x)
3339 {
3340    LLVMValueRef res;
3341    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3342    return res;
3343 }
3344
3345
3346 /**
3347  * Faster (and less accurate) log2.
3348  *
3349  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3350  *
3351  * Piece-wise linear approximation, with exact results when x is a
3352  * power of two.
3353  *
3354  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3355  */
3356 LLVMValueRef
3357 lp_build_fast_log2(struct lp_build_context *bld,
3358                    LLVMValueRef x)
3359 {
3360    LLVMBuilderRef builder = bld->gallivm->builder;
3361    LLVMValueRef ipart;
3362    LLVMValueRef fpart;
3363
3364    assert(lp_check_value(bld->type, x));
3365
3366    assert(bld->type.floating);
3367
3368    /* ipart = floor(log2(x)) - 1 */
3369    ipart = lp_build_extract_exponent(bld, x, -1);
3370    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3371
3372    /* fpart = x / 2**ipart */
3373    fpart = lp_build_extract_mantissa(bld, x);
3374
3375    /* ipart + fpart */
3376    return LLVMBuildFAdd(builder, ipart, fpart, "");
3377 }
3378
3379
3380 /**
3381  * Fast implementation of iround(log2(x)).
3382  *
3383  * Not an approximation -- it should give accurate results all the time.
3384  */
3385 LLVMValueRef
3386 lp_build_ilog2(struct lp_build_context *bld,
3387                LLVMValueRef x)
3388 {
3389    LLVMBuilderRef builder = bld->gallivm->builder;
3390    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3391    LLVMValueRef ipart;
3392
3393    assert(bld->type.floating);
3394
3395    assert(lp_check_value(bld->type, x));
3396
3397    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3398    x = LLVMBuildFMul(builder, x, sqrt2, "");
3399
3400    /* ipart = floor(log2(x) + 0.5)  */
3401    ipart = lp_build_extract_exponent(bld, x, 0);
3402
3403    return ipart;
3404 }
3405
3406 LLVMValueRef
3407 lp_build_mod(struct lp_build_context *bld,
3408              LLVMValueRef x,
3409              LLVMValueRef y)
3410 {
3411    LLVMBuilderRef builder = bld->gallivm->builder;
3412    LLVMValueRef res;
3413    const struct lp_type type = bld->type;
3414
3415    assert(lp_check_value(type, x));
3416    assert(lp_check_value(type, y));
3417
3418    if (type.floating)
3419       res = LLVMBuildFRem(builder, x, y, "");
3420    else if (type.sign)
3421       res = LLVMBuildSRem(builder, x, y, "");
3422    else
3423       res = LLVMBuildURem(builder, x, y, "");
3424    return res;
3425 }
3426
3427
3428 /*
3429  * For floating inputs it creates and returns a mask
3430  * which is all 1's for channels which are NaN.
3431  * Channels inside x which are not NaN will be 0.
3432  */
3433 LLVMValueRef
3434 lp_build_isnan(struct lp_build_context *bld,
3435                LLVMValueRef x)
3436 {
3437    LLVMValueRef mask;
3438    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3439
3440    assert(bld->type.floating);
3441    assert(lp_check_value(bld->type, x));
3442
3443    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3444                         "isnotnan");
3445    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3446    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3447    return mask;
3448 }
3449
3450 /* Returns all 1's for floating point numbers that are
3451  * finite numbers and returns all zeros for -inf,
3452  * inf and nan's */
3453 LLVMValueRef
3454 lp_build_isfinite(struct lp_build_context *bld,
3455                   LLVMValueRef x)
3456 {
3457    LLVMBuilderRef builder = bld->gallivm->builder;
3458    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3459    struct lp_type int_type = lp_int_type(bld->type);
3460    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3461    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3462                                                     0x7f800000);
3463
3464    if (!bld->type.floating) {
3465       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3466    }
3467    assert(bld->type.floating);
3468    assert(lp_check_value(bld->type, x));
3469    assert(bld->type.width == 32);
3470
3471    intx = LLVMBuildAnd(builder, intx, infornan32, "");
3472    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3473                            intx, infornan32);
3474 }
3475
3476 /*
3477  * Returns true if the number is nan or inf and false otherwise.
3478  * The input has to be a floating point vector.
3479  */
3480 LLVMValueRef
3481 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3482                        const struct lp_type type,
3483                        LLVMValueRef x)
3484 {
3485    LLVMBuilderRef builder = gallivm->builder;
3486    struct lp_type int_type = lp_int_type(type);
3487    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3488                                                 0x7f800000);
3489    LLVMValueRef ret;
3490
3491    assert(type.floating);
3492
3493    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3494    ret = LLVMBuildAnd(builder, ret, const0, "");
3495    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3496                           ret, const0);
3497
3498    return ret;
3499 }
3500
3501
3502 LLVMValueRef
3503 lp_build_fpstate_get(struct gallivm_state *gallivm)
3504 {
3505    if (util_cpu_caps.has_sse) {
3506       LLVMBuilderRef builder = gallivm->builder;
3507       LLVMValueRef mxcsr_ptr = lp_build_alloca(
3508          gallivm,
3509          LLVMInt32TypeInContext(gallivm->context),
3510          "mxcsr_ptr");
3511       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3512           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3513       lp_build_intrinsic(builder,
3514                          "llvm.x86.sse.stmxcsr",
3515                          LLVMVoidTypeInContext(gallivm->context),
3516                          &mxcsr_ptr8, 1);
3517       return mxcsr_ptr;
3518    }
3519    return 0;
3520 }
3521
3522 void
3523 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3524                                   boolean zero)
3525 {
3526    if (util_cpu_caps.has_sse) {
3527       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3528       int daz_ftz = _MM_FLUSH_ZERO_MASK;
3529
3530       LLVMBuilderRef builder = gallivm->builder;
3531       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3532       LLVMValueRef mxcsr =
3533          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3534
3535       if (util_cpu_caps.has_daz) {
3536          /* Enable denormals are zero mode */
3537          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3538       }
3539       if (zero) {
3540          mxcsr = LLVMBuildOr(builder, mxcsr,
3541                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3542       } else {
3543          mxcsr = LLVMBuildAnd(builder, mxcsr,
3544                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3545       }
3546
3547       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3548       lp_build_fpstate_set(gallivm, mxcsr_ptr);
3549    }
3550 }
3551
3552 void
3553 lp_build_fpstate_set(struct gallivm_state *gallivm,
3554                      LLVMValueRef mxcsr_ptr)
3555 {
3556    if (util_cpu_caps.has_sse) {
3557       LLVMBuilderRef builder = gallivm->builder;
3558       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3559                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3560       lp_build_intrinsic(builder,
3561                          "llvm.x86.sse.ldmxcsr",
3562                          LLVMVoidTypeInContext(gallivm->context),
3563                          &mxcsr_ptr, 1);
3564    }
3565 }