gallivm: replace more complex 3.x version check with LLVM_VERSION_MAJOR/MINOR
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87  * No checks for the special-case values a or b = 1 or 0 are done.
88  * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_cpu_caps.has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_cpu_caps.has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_cpu_caps.has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_cpu_caps.has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
139 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 __FUNCTION__);
142 }
143 if (type.width == 32 && type.length == 4) {
144 intrinsic = "llvm.ppc.altivec.vminfp";
145 intr_size = 128;
146 }
147 } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
148 util_cpu_caps.has_avx2 && type.length > 4) {
149 intr_size = 256;
150 switch (type.width) {
151 case 8:
152 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
153 break;
154 case 16:
155 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
156 break;
157 case 32:
158 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
159 break;
160 }
161 } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
162 util_cpu_caps.has_sse2 && type.length >= 2) {
163 intr_size = 128;
164 if ((type.width == 8 || type.width == 16) &&
165 (type.width * type.length <= 64) &&
166 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
167 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
168 __FUNCTION__);
169 }
170 if (type.width == 8 && !type.sign) {
171 intrinsic = "llvm.x86.sse2.pminu.b";
172 }
173 else if (type.width == 16 && type.sign) {
174 intrinsic = "llvm.x86.sse2.pmins.w";
175 }
176 if (util_cpu_caps.has_sse4_1) {
177 if (type.width == 8 && type.sign) {
178 intrinsic = "llvm.x86.sse41.pminsb";
179 }
180 if (type.width == 16 && !type.sign) {
181 intrinsic = "llvm.x86.sse41.pminuw";
182 }
183 if (type.width == 32 && !type.sign) {
184 intrinsic = "llvm.x86.sse41.pminud";
185 }
186 if (type.width == 32 && type.sign) {
187 intrinsic = "llvm.x86.sse41.pminsd";
188 }
189 }
190 } else if (util_cpu_caps.has_altivec) {
191 intr_size = 128;
192 if (type.width == 8) {
193 if (!type.sign) {
194 intrinsic = "llvm.ppc.altivec.vminub";
195 } else {
196 intrinsic = "llvm.ppc.altivec.vminsb";
197 }
198 } else if (type.width == 16) {
199 if (!type.sign) {
200 intrinsic = "llvm.ppc.altivec.vminuh";
201 } else {
202 intrinsic = "llvm.ppc.altivec.vminsh";
203 }
204 } else if (type.width == 32) {
205 if (!type.sign) {
206 intrinsic = "llvm.ppc.altivec.vminuw";
207 } else {
208 intrinsic = "llvm.ppc.altivec.vminsw";
209 }
210 }
211 }
212
213 if (intrinsic) {
214       /* We need to handle NaNs for floating point numbers. If one of the
215        * inputs is NaN the other should be returned (required by both D3D10+
216        * and OpenCL).
217        * The SSE intrinsics return the second operand in case of NaN by
218        * default, so we need special code to handle those cases.
219 */
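      /*
       * Illustrative example of the two behaviors handled below, assuming
       * min(a = NaN, b = 3.0f):
       *   GALLIVM_NAN_RETURN_OTHER -> 3.0f (the non-NaN operand is returned)
       *   GALLIVM_NAN_RETURN_NAN   -> NaN  (any NaN input yields NaN)
       * The lp_build_isnan() select covers the one operand position the SSE
       * min instruction does not already handle.
       */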
220 if (util_cpu_caps.has_sse && type.floating &&
221 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
222 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
223 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
224 LLVMValueRef isnan, min;
225 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
226 type,
227 intr_size, a, b);
228 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
229 isnan = lp_build_isnan(bld, b);
230 return lp_build_select(bld, isnan, a, min);
231 } else {
232 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
233 isnan = lp_build_isnan(bld, a);
234 return lp_build_select(bld, isnan, a, min);
235 }
236 } else {
237 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
238 type,
239 intr_size, a, b);
240 }
241 }
242
243 if (type.floating) {
244 switch (nan_behavior) {
245 case GALLIVM_NAN_RETURN_NAN: {
246 LLVMValueRef isnan = lp_build_isnan(bld, b);
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
248 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
249 return lp_build_select(bld, cond, a, b);
250 }
251 break;
252 case GALLIVM_NAN_RETURN_OTHER: {
253 LLVMValueRef isnan = lp_build_isnan(bld, a);
254 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
255 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
256 return lp_build_select(bld, cond, a, b);
257 }
258 break;
259 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
260 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
261 return lp_build_select(bld, cond, a, b);
262 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
263 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
264 return lp_build_select(bld, cond, b, a);
265 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
266 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
267 return lp_build_select(bld, cond, a, b);
268 break;
269 default:
270 assert(0);
271 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
272 return lp_build_select(bld, cond, a, b);
273 }
274 } else {
275 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
276 return lp_build_select(bld, cond, a, b);
277 }
278 }
279
280
281 LLVMValueRef
282 lp_build_fmuladd(LLVMBuilderRef builder,
283 LLVMValueRef a,
284 LLVMValueRef b,
285 LLVMValueRef c)
286 {
287 LLVMTypeRef type = LLVMTypeOf(a);
288 assert(type == LLVMTypeOf(b));
289 assert(type == LLVMTypeOf(c));
290 if (LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 4)) {
291       /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
292        * not supported, and instead it falls back to a C function.
293 */
294 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
295 }
296 char intrinsic[32];
297 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
298 LLVMValueRef args[] = { a, b, c };
299 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
300 }
301
302
303 /**
304 * Generate max(a, b)
305  * No checks for the special-case values a or b = 1 or 0 are done.
306  * NaNs are handled according to the behavior specified by the
307 * nan_behavior argument.
308 */
309 static LLVMValueRef
310 lp_build_max_simple(struct lp_build_context *bld,
311 LLVMValueRef a,
312 LLVMValueRef b,
313 enum gallivm_nan_behavior nan_behavior)
314 {
315 const struct lp_type type = bld->type;
316 const char *intrinsic = NULL;
317 unsigned intr_size = 0;
318 LLVMValueRef cond;
319
320 assert(lp_check_value(type, a));
321 assert(lp_check_value(type, b));
322
323 /* TODO: optimize the constant case */
324
325 if (type.floating && util_cpu_caps.has_sse) {
326 if (type.width == 32) {
327 if (type.length == 1) {
328 intrinsic = "llvm.x86.sse.max.ss";
329 intr_size = 128;
330 }
331 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
332 intrinsic = "llvm.x86.sse.max.ps";
333 intr_size = 128;
334 }
335 else {
336 intrinsic = "llvm.x86.avx.max.ps.256";
337 intr_size = 256;
338 }
339 }
340 if (type.width == 64 && util_cpu_caps.has_sse2) {
341 if (type.length == 1) {
342 intrinsic = "llvm.x86.sse2.max.sd";
343 intr_size = 128;
344 }
345 else if (type.length == 2 || !util_cpu_caps.has_avx) {
346 intrinsic = "llvm.x86.sse2.max.pd";
347 intr_size = 128;
348 }
349 else {
350 intrinsic = "llvm.x86.avx.max.pd.256";
351 intr_size = 256;
352 }
353 }
354 }
355 else if (type.floating && util_cpu_caps.has_altivec) {
356 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
357 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
358 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
359 __FUNCTION__);
360 }
361       if (type.width == 32 && type.length == 4) {
362 intrinsic = "llvm.ppc.altivec.vmaxfp";
363 intr_size = 128;
364 }
365 } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
366 util_cpu_caps.has_avx2 && type.length > 4) {
367 intr_size = 256;
368 switch (type.width) {
369 case 8:
370 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
371 break;
372 case 16:
373 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
374 break;
375 case 32:
376 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
377 break;
378 }
379 } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
380 util_cpu_caps.has_sse2 && type.length >= 2) {
381 intr_size = 128;
382 if ((type.width == 8 || type.width == 16) &&
383 (type.width * type.length <= 64) &&
384 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
385 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
386 __FUNCTION__);
387 }
388 if (type.width == 8 && !type.sign) {
389 intrinsic = "llvm.x86.sse2.pmaxu.b";
390 intr_size = 128;
391 }
392 else if (type.width == 16 && type.sign) {
393 intrinsic = "llvm.x86.sse2.pmaxs.w";
394 }
395 if (util_cpu_caps.has_sse4_1) {
396 if (type.width == 8 && type.sign) {
397 intrinsic = "llvm.x86.sse41.pmaxsb";
398 }
399 if (type.width == 16 && !type.sign) {
400 intrinsic = "llvm.x86.sse41.pmaxuw";
401 }
402 if (type.width == 32 && !type.sign) {
403 intrinsic = "llvm.x86.sse41.pmaxud";
404 }
405 if (type.width == 32 && type.sign) {
406 intrinsic = "llvm.x86.sse41.pmaxsd";
407 }
408 }
409 } else if (util_cpu_caps.has_altivec) {
410 intr_size = 128;
411 if (type.width == 8) {
412 if (!type.sign) {
413 intrinsic = "llvm.ppc.altivec.vmaxub";
414 } else {
415 intrinsic = "llvm.ppc.altivec.vmaxsb";
416 }
417 } else if (type.width == 16) {
418 if (!type.sign) {
419 intrinsic = "llvm.ppc.altivec.vmaxuh";
420 } else {
421 intrinsic = "llvm.ppc.altivec.vmaxsh";
422 }
423 } else if (type.width == 32) {
424 if (!type.sign) {
425 intrinsic = "llvm.ppc.altivec.vmaxuw";
426 } else {
427 intrinsic = "llvm.ppc.altivec.vmaxsw";
428 }
429 }
430 }
431
432 if (intrinsic) {
433 if (util_cpu_caps.has_sse && type.floating &&
434 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
435 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
436 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
437 LLVMValueRef isnan, max;
438 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
439 type,
440 intr_size, a, b);
441 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
442 isnan = lp_build_isnan(bld, b);
443 return lp_build_select(bld, isnan, a, max);
444 } else {
445 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
446 isnan = lp_build_isnan(bld, a);
447 return lp_build_select(bld, isnan, a, max);
448 }
449 } else {
450 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
451 type,
452 intr_size, a, b);
453 }
454 }
455
456 if (type.floating) {
457 switch (nan_behavior) {
458 case GALLIVM_NAN_RETURN_NAN: {
459 LLVMValueRef isnan = lp_build_isnan(bld, b);
460 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
461 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
462 return lp_build_select(bld, cond, a, b);
463 }
464 break;
465 case GALLIVM_NAN_RETURN_OTHER: {
466 LLVMValueRef isnan = lp_build_isnan(bld, a);
467 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
468 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
469 return lp_build_select(bld, cond, a, b);
470 }
471 break;
472 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
473 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
474 return lp_build_select(bld, cond, a, b);
475 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
476 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
477 return lp_build_select(bld, cond, b, a);
478 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
479 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
480 return lp_build_select(bld, cond, a, b);
481 break;
482 default:
483 assert(0);
484 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
485 return lp_build_select(bld, cond, a, b);
486 }
487 } else {
488 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
489 return lp_build_select(bld, cond, a, b);
490 }
491 }
492
493
494 /**
495 * Generate 1 - a, or ~a depending on bld->type.
496 */
497 LLVMValueRef
498 lp_build_comp(struct lp_build_context *bld,
499 LLVMValueRef a)
500 {
501 LLVMBuilderRef builder = bld->gallivm->builder;
502 const struct lp_type type = bld->type;
503
504 assert(lp_check_value(type, a));
505
506 if(a == bld->one)
507 return bld->zero;
508 if(a == bld->zero)
509 return bld->one;
510
511 if(type.norm && !type.floating && !type.fixed && !type.sign) {
512 if(LLVMIsConstant(a))
513 return LLVMConstNot(a);
514 else
515 return LLVMBuildNot(builder, a, "");
516 }
517
518 if(LLVMIsConstant(a))
519 if (type.floating)
520 return LLVMConstFSub(bld->one, a);
521 else
522 return LLVMConstSub(bld->one, a);
523 else
524 if (type.floating)
525 return LLVMBuildFSub(builder, bld->one, a, "");
526 else
527 return LLVMBuildSub(builder, bld->one, a, "");
528 }
529
530
531 /**
532 * Generate a + b
533 */
534 LLVMValueRef
535 lp_build_add(struct lp_build_context *bld,
536 LLVMValueRef a,
537 LLVMValueRef b)
538 {
539 LLVMBuilderRef builder = bld->gallivm->builder;
540 const struct lp_type type = bld->type;
541 LLVMValueRef res;
542
543 assert(lp_check_value(type, a));
544 assert(lp_check_value(type, b));
545
546 if (a == bld->zero)
547 return b;
548 if (b == bld->zero)
549 return a;
550 if (a == bld->undef || b == bld->undef)
551 return bld->undef;
552
553 if (type.norm) {
554 const char *intrinsic = NULL;
555
556 if (!type.sign && (a == bld->one || b == bld->one))
557 return bld->one;
558
559 if (!type.floating && !type.fixed) {
560 if (LLVM_VERSION_MAJOR >= 9) {
561 char intrin[32];
562 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
563 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
564 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
565 }
566 if (type.width * type.length == 128) {
567 if (util_cpu_caps.has_sse2) {
568 if (type.width == 8)
569 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
570 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.b" : NULL;
571 if (type.width == 16)
572 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
573 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.w" : NULL;
574 } else if (util_cpu_caps.has_altivec) {
575 if (type.width == 8)
576 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
577 if (type.width == 16)
578 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
579 }
580 }
581 if (type.width * type.length == 256) {
582 if (util_cpu_caps.has_avx2) {
583 if (type.width == 8)
584 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
585 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.b" : NULL;
586 if (type.width == 16)
587 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
588 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.w" : NULL;
589 }
590 }
591 }
592
593 if (intrinsic)
594 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
595 }
596
597 if(type.norm && !type.floating && !type.fixed) {
598 if (type.sign) {
599 uint64_t sign = (uint64_t)1 << (type.width - 1);
600 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
601 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
602 /* a_clamp_max is the maximum a for positive b,
603 a_clamp_min is the minimum a for negative b. */
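         /*
          * Illustrative numbers for 8-bit signed: max_val = 127, min_val = -128;
          * with b = 100 the min below clamps a to at most 27, so a + b cannot
          * overflow past 127 (sketch only, the code handles all widths).
          */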
604 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
605 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
606 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
607 }
608 }
609
610 if(LLVMIsConstant(a) && LLVMIsConstant(b))
611 if (type.floating)
612 res = LLVMConstFAdd(a, b);
613 else
614 res = LLVMConstAdd(a, b);
615 else
616 if (type.floating)
617 res = LLVMBuildFAdd(builder, a, b, "");
618 else
619 res = LLVMBuildAdd(builder, a, b, "");
620
621 /* clamp to ceiling of 1.0 */
622 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
623 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
624
625 if (type.norm && !type.floating && !type.fixed) {
626 if (!type.sign) {
627 /*
628 * newer llvm versions no longer support the intrinsics, but recognize
629 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
630 * code, it is important we match the pattern llvm uses (and pray llvm
631 * doesn't change it - and hope they decide on the same pattern for
632 * all backends supporting it...).
633 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
634 * interfere with llvm's ability to recognize the pattern but seems
635 * a bit brittle.
636 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
637 */
638 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
639 res = lp_build_select(bld, overflowed,
640 LLVMConstAllOnes(bld->int_vec_type), res);
641 }
642 }
643
644 /* XXX clamp to floor of -1 or 0??? */
645
646 return res;
647 }
648
649
650 /** Return the scalar sum of the elements of a.
651 * Should avoid this operation whenever possible.
652 */
653 LLVMValueRef
654 lp_build_horizontal_add(struct lp_build_context *bld,
655 LLVMValueRef a)
656 {
657 LLVMBuilderRef builder = bld->gallivm->builder;
658 const struct lp_type type = bld->type;
659 LLVMValueRef index, res;
660 unsigned i, length;
661 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
662 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
663 LLVMValueRef vecres, elem2;
664
665 assert(lp_check_value(type, a));
666
667 if (type.length == 1) {
668 return a;
669 }
670
671 assert(!bld->type.norm);
672
673 /*
674     * for byte vectors we could do much better with psadbw.
675     * Using repeated shuffle/adds here. Note with multiple vectors
676     * this can be done more efficiently as outlined in the Intel
677 * optimization manual.
678 * Note: could cause data rearrangement if used with smaller element
679 * sizes.
680 */
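   /*
    * Sketch of the reduction below for a 4-element vector {a,b,c,d}
    * (illustrative only):
    *   step 1: shuffle into {a,b} and {c,d}, add -> {a+c, b+d}
    *   step 2: extract lanes 0 and 1, add        -> (a+c) + (b+d)
    */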
681
682 vecres = a;
683 length = type.length / 2;
684 while (length > 1) {
685 LLVMValueRef vec1, vec2;
686 for (i = 0; i < length; i++) {
687 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
688 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
689 }
690 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
691 LLVMConstVector(shuffles1, length), "");
692 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
693 LLVMConstVector(shuffles2, length), "");
694 if (type.floating) {
695 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
696 }
697 else {
698 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
699 }
700 length = length >> 1;
701 }
702
703 /* always have vector of size 2 here */
704 assert(length == 1);
705
706 index = lp_build_const_int32(bld->gallivm, 0);
707 res = LLVMBuildExtractElement(builder, vecres, index, "");
708 index = lp_build_const_int32(bld->gallivm, 1);
709 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
710
711 if (type.floating)
712 res = LLVMBuildFAdd(builder, res, elem2, "");
713 else
714 res = LLVMBuildAdd(builder, res, elem2, "");
715
716 return res;
717 }
718
719 /**
720 * Return the horizontal sums of 4 float vectors as a float4 vector.
721  * This uses the technique outlined in the Intel Optimization Manual.
722 */
723 static LLVMValueRef
724 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
725 LLVMValueRef src[4])
726 {
727 struct gallivm_state *gallivm = bld->gallivm;
728 LLVMBuilderRef builder = gallivm->builder;
729 LLVMValueRef shuffles[4];
730 LLVMValueRef tmp[4];
731 LLVMValueRef sumtmp[2], shuftmp[2];
732
733 /* lower half of regs */
734 shuffles[0] = lp_build_const_int32(gallivm, 0);
735 shuffles[1] = lp_build_const_int32(gallivm, 1);
736 shuffles[2] = lp_build_const_int32(gallivm, 4);
737 shuffles[3] = lp_build_const_int32(gallivm, 5);
738 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
739 LLVMConstVector(shuffles, 4), "");
740 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
741 LLVMConstVector(shuffles, 4), "");
742
743 /* upper half of regs */
744 shuffles[0] = lp_build_const_int32(gallivm, 2);
745 shuffles[1] = lp_build_const_int32(gallivm, 3);
746 shuffles[2] = lp_build_const_int32(gallivm, 6);
747 shuffles[3] = lp_build_const_int32(gallivm, 7);
748 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
749 LLVMConstVector(shuffles, 4), "");
750 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
751 LLVMConstVector(shuffles, 4), "");
752
753 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
754 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
755
756 shuffles[0] = lp_build_const_int32(gallivm, 0);
757 shuffles[1] = lp_build_const_int32(gallivm, 2);
758 shuffles[2] = lp_build_const_int32(gallivm, 4);
759 shuffles[3] = lp_build_const_int32(gallivm, 6);
760 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
761 LLVMConstVector(shuffles, 4), "");
762
763 shuffles[0] = lp_build_const_int32(gallivm, 1);
764 shuffles[1] = lp_build_const_int32(gallivm, 3);
765 shuffles[2] = lp_build_const_int32(gallivm, 5);
766 shuffles[3] = lp_build_const_int32(gallivm, 7);
767 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
768 LLVMConstVector(shuffles, 4), "");
769
770 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
771 }
772
773
774 /*
775 * partially horizontally add 2-4 float vectors with length nx4,
776 * i.e. only four adjacent values in each vector will be added,
777 * assuming values are really grouped in 4 which also determines
778 * output order.
779 *
780 * Return a vector of the same length as the initial vectors,
781 * with the excess elements (if any) being undefined.
782 * The element order is independent of number of input vectors.
783 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
784 * the output order thus will be
785  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
786 */
787 LLVMValueRef
788 lp_build_hadd_partial4(struct lp_build_context *bld,
789 LLVMValueRef vectors[],
790 unsigned num_vecs)
791 {
792 struct gallivm_state *gallivm = bld->gallivm;
793 LLVMBuilderRef builder = gallivm->builder;
794 LLVMValueRef ret_vec;
795 LLVMValueRef tmp[4];
796 const char *intrinsic = NULL;
797
798 assert(num_vecs >= 2 && num_vecs <= 4);
799 assert(bld->type.floating);
800
801 /* only use this with at least 2 vectors, as it is sort of expensive
802 * (depending on cpu) and we always need two horizontal adds anyway,
803 * so a shuffle/add approach might be better.
804 */
805
806 tmp[0] = vectors[0];
807 tmp[1] = vectors[1];
808
809 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
810 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
811
812 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
813 bld->type.length == 4) {
814 intrinsic = "llvm.x86.sse3.hadd.ps";
815 }
816 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
817 bld->type.length == 8) {
818 intrinsic = "llvm.x86.avx.hadd.ps.256";
819 }
820 if (intrinsic) {
821 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
822 lp_build_vec_type(gallivm, bld->type),
823 tmp[0], tmp[1]);
824 if (num_vecs > 2) {
825 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
826 lp_build_vec_type(gallivm, bld->type),
827 tmp[2], tmp[3]);
828 }
829 else {
830 tmp[1] = tmp[0];
831 }
832 return lp_build_intrinsic_binary(builder, intrinsic,
833 lp_build_vec_type(gallivm, bld->type),
834 tmp[0], tmp[1]);
835 }
836
837 if (bld->type.length == 4) {
838 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
839 }
840 else {
841 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
842 unsigned j;
843 unsigned num_iter = bld->type.length / 4;
844 struct lp_type parttype = bld->type;
845 parttype.length = 4;
846 for (j = 0; j < num_iter; j++) {
847 LLVMValueRef partsrc[4];
848 unsigned i;
849 for (i = 0; i < 4; i++) {
850 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
851 }
852 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
853 }
854 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
855 }
856 return ret_vec;
857 }
858
859 /**
860 * Generate a - b
861 */
862 LLVMValueRef
863 lp_build_sub(struct lp_build_context *bld,
864 LLVMValueRef a,
865 LLVMValueRef b)
866 {
867 LLVMBuilderRef builder = bld->gallivm->builder;
868 const struct lp_type type = bld->type;
869 LLVMValueRef res;
870
871 assert(lp_check_value(type, a));
872 assert(lp_check_value(type, b));
873
874 if (b == bld->zero)
875 return a;
876 if (a == bld->undef || b == bld->undef)
877 return bld->undef;
878 if (a == b)
879 return bld->zero;
880
881 if (type.norm) {
882 const char *intrinsic = NULL;
883
884 if (!type.sign && b == bld->one)
885 return bld->zero;
886
887 if (!type.floating && !type.fixed) {
888 if (LLVM_VERSION_MAJOR >= 9) {
889 char intrin[32];
890 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
891 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
892 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
893 }
894 if (type.width * type.length == 128) {
895 if (util_cpu_caps.has_sse2) {
896 if (type.width == 8)
897 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
898 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.b" : NULL;
899 if (type.width == 16)
900 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
901 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.w" : NULL;
902 } else if (util_cpu_caps.has_altivec) {
903 if (type.width == 8)
904 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
905 if (type.width == 16)
906 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
907 }
908 }
909 if (type.width * type.length == 256) {
910 if (util_cpu_caps.has_avx2) {
911 if (type.width == 8)
912 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
913 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.b" : NULL;
914 if (type.width == 16)
915 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
916 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.w" : NULL;
917 }
918 }
919 }
920
921 if (intrinsic)
922 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
923 }
924
925 if(type.norm && !type.floating && !type.fixed) {
926 if (type.sign) {
927 uint64_t sign = (uint64_t)1 << (type.width - 1);
928 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
929 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
930 /* a_clamp_max is the maximum a for negative b,
931 a_clamp_min is the minimum a for positive b. */
932 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
933 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
934 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
935 } else {
936 /*
937 * This must match llvm pattern for saturated unsigned sub.
938 * (lp_build_max_simple actually does the job with its current
939 * definition but do it explicitly here.)
940 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
941 * interfere with llvm's ability to recognize the pattern but seems
942 * a bit brittle.
943 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
944 */
945 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
946 a = lp_build_select(bld, no_ov, a, b);
947 }
948 }
949
950 if(LLVMIsConstant(a) && LLVMIsConstant(b))
951 if (type.floating)
952 res = LLVMConstFSub(a, b);
953 else
954 res = LLVMConstSub(a, b);
955 else
956 if (type.floating)
957 res = LLVMBuildFSub(builder, a, b, "");
958 else
959 res = LLVMBuildSub(builder, a, b, "");
960
961 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
962 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
963
964 return res;
965 }
966
967
968
969 /**
970 * Normalized multiplication.
971 *
972 * There are several approaches for (using 8-bit normalized multiplication as
973 * an example):
974 *
975 * - alpha plus one
976 *
977 * makes the following approximation to the division (Sree)
978 *
979 * a*b/255 ~= (a*(b + 1)) >> 256
980 *
981 * which is the fastest method that satisfies the following OpenGL criteria of
982 *
983 * 0*0 = 0 and 255*255 = 255
984 *
985 * - geometric series
986 *
987 * takes the geometric series approximation to the division
988 *
989 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
990 *
991 * in this case just the first two terms to fit in 16bit arithmetic
992 *
993 * t/255 ~= (t + (t >> 8)) >> 8
994 *
995  * note that just by itself it doesn't satisfy the OpenGL criteria, as
996  * 255*255 = 254, so the special case b = 255 must be accounted for, or roundoff
997 * must be used.
998 *
999 * - geometric series plus rounding
1000 *
1001 * when using a geometric series division instead of truncating the result
1002 * use roundoff in the approximation (Jim Blinn)
1003 *
1004 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
1005 *
1006  * achieving exact results.
1007 *
1008 *
1009 *
1010 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
1011 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
1012 * @sa Michael Herf, The "double blend trick", May 2000,
1013 * http://www.stereopsis.com/doubleblend.html
1014 */
1015 LLVMValueRef
1016 lp_build_mul_norm(struct gallivm_state *gallivm,
1017 struct lp_type wide_type,
1018 LLVMValueRef a, LLVMValueRef b)
1019 {
1020 LLVMBuilderRef builder = gallivm->builder;
1021 struct lp_build_context bld;
1022 unsigned n;
1023 LLVMValueRef half;
1024 LLVMValueRef ab;
1025
1026 assert(!wide_type.floating);
1027 assert(lp_check_value(wide_type, a));
1028 assert(lp_check_value(wide_type, b));
1029
1030 lp_build_context_init(&bld, gallivm, wide_type);
1031
1032 n = wide_type.width / 2;
1033 if (wide_type.sign) {
1034 --n;
1035 }
1036
1037 /*
1038 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1039 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1040 */
1041
1042 /*
1043 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1044 */
1045
1046 ab = LLVMBuildMul(builder, a, b, "");
1047 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1048
1049 /*
1050 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1051 */
1052
1053 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1054 if (wide_type.sign) {
1055 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1056 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1057 half = lp_build_select(&bld, sign, minus_half, half);
1058 }
1059 ab = LLVMBuildAdd(builder, ab, half, "");
1060
1061 /* Final division */
1062 ab = lp_build_shr_imm(&bld, ab, n);
1063
1064 return ab;
1065 }
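#if 0
/*
 * Purely illustrative scalar sketch of the unsigned 8-bit case handled by
 * lp_build_mul_norm above (wide_type = u16, n = 8).  Not compiled, and the
 * helper name is made up; it only exists to show the arithmetic, e.g.
 * 255 * 255: 65025 -> 65279 -> 65407 -> 255, while 0 * x stays 0.
 */
static uint8_t
u8_mul_norm_sketch(uint8_t a, uint8_t b)
{
   uint16_t ab = (uint16_t)a * b;   /* wide multiply */
   ab = ab + (ab >> 8);             /* geometric series term */
   ab = ab + 0x80;                  /* rounding bias (half) */
   return ab >> 8;                  /* final division by 2**8 */
}
#endif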
1066
1067 /**
1068 * Generate a * b
1069 */
1070 LLVMValueRef
1071 lp_build_mul(struct lp_build_context *bld,
1072 LLVMValueRef a,
1073 LLVMValueRef b)
1074 {
1075 LLVMBuilderRef builder = bld->gallivm->builder;
1076 const struct lp_type type = bld->type;
1077 LLVMValueRef shift;
1078 LLVMValueRef res;
1079
1080 assert(lp_check_value(type, a));
1081 assert(lp_check_value(type, b));
1082
1083 if(a == bld->zero)
1084 return bld->zero;
1085 if(a == bld->one)
1086 return b;
1087 if(b == bld->zero)
1088 return bld->zero;
1089 if(b == bld->one)
1090 return a;
1091 if(a == bld->undef || b == bld->undef)
1092 return bld->undef;
1093
1094 if (!type.floating && !type.fixed && type.norm) {
1095 struct lp_type wide_type = lp_wider_type(type);
1096 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1097
1098 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1099 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1100
1101 /* PMULLW, PSRLW, PADDW */
1102 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1103 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1104
1105 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1106
1107 return ab;
1108 }
1109
1110 if(type.fixed)
1111 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1112 else
1113 shift = NULL;
1114
1115 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1116 if (type.floating)
1117 res = LLVMConstFMul(a, b);
1118 else
1119 res = LLVMConstMul(a, b);
1120 if(shift) {
1121 if(type.sign)
1122 res = LLVMConstAShr(res, shift);
1123 else
1124 res = LLVMConstLShr(res, shift);
1125 }
1126 }
1127 else {
1128 if (type.floating)
1129 res = LLVMBuildFMul(builder, a, b, "");
1130 else
1131 res = LLVMBuildMul(builder, a, b, "");
1132 if(shift) {
1133 if(type.sign)
1134 res = LLVMBuildAShr(builder, res, shift, "");
1135 else
1136 res = LLVMBuildLShr(builder, res, shift, "");
1137 }
1138 }
1139
1140 return res;
1141 }
1142
1143 /*
1144 * Widening mul, valid for 32x32 bit -> 64bit only.
1145 * Result is low 32bits, high bits returned in res_hi.
1146 *
1147 * Emits code that is meant to be compiled for the host CPU.
1148 */
1149 LLVMValueRef
1150 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1151 LLVMValueRef a,
1152 LLVMValueRef b,
1153 LLVMValueRef *res_hi)
1154 {
1155 struct gallivm_state *gallivm = bld->gallivm;
1156 LLVMBuilderRef builder = gallivm->builder;
1157
1158 assert(bld->type.width == 32);
1159 assert(bld->type.floating == 0);
1160 assert(bld->type.fixed == 0);
1161 assert(bld->type.norm == 0);
1162
1163 /*
1164 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1165 * for x86 simd is atrocious (even if the high bits weren't required),
1166 * trying to handle real 64bit inputs (which of course can't happen due
1167 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1168 * apparently llvm does not recognize this widening mul). This includes 6
1169 * (instead of 2) pmuludq plus extra adds and shifts
1170 * The same story applies to signed mul, albeit fixing this requires sse41.
1171 * https://llvm.org/bugs/show_bug.cgi?id=30845
1172 * So, whip up our own code, albeit only for length 4 and 8 (which
1173 * should be good enough)...
1174 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1175 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1176 * for signed), which the fallback code does not, without this llvm
1177 * will likely still produce atrocious code.
1178 */
1179 if (LLVM_VERSION_MAJOR < 7 &&
1180 (bld->type.length == 4 || bld->type.length == 8) &&
1181 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1182 util_cpu_caps.has_sse4_1)) {
1183 const char *intrinsic = NULL;
1184 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1185 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1186 struct lp_type type_wide = lp_wider_type(bld->type);
1187 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1188 unsigned i;
1189 for (i = 0; i < bld->type.length; i += 2) {
1190 shuf[i] = lp_build_const_int32(gallivm, i+1);
1191 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1192 }
1193 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1194 aeven = a;
1195 beven = b;
1196 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1197 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
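      /*
       * Lane layout sketch (length 4, illustrative): pmuludq/pmuldq multiply
       * the low 32 bits of each 64-bit lane, i.e. vector elements 0 and 2, so
       * a/b directly supply the even elements and the shuffle above moves
       * elements 1 and 3 into those positions for aodd/bodd.
       */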
1198
1199 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1200 if (bld->type.sign) {
1201 intrinsic = "llvm.x86.avx2.pmul.dq";
1202 } else {
1203 intrinsic = "llvm.x86.avx2.pmulu.dq";
1204 }
1205 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1206 wider_type, aeven, beven);
1207 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1208 wider_type, aodd, bodd);
1209 }
1210 else {
1211 /* for consistent naming look elsewhere... */
1212 if (bld->type.sign) {
1213 intrinsic = "llvm.x86.sse41.pmuldq";
1214 } else {
1215 intrinsic = "llvm.x86.sse2.pmulu.dq";
1216 }
1217 /*
1218 * XXX If we only have AVX but not AVX2 this is a pain.
1219 * lp_build_intrinsic_binary_anylength() can't handle it
1220 * (due to src and dst type not being identical).
1221 */
1222 if (bld->type.length == 8) {
1223 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1224 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1225 LLVMValueRef muleven2[2], mulodd2[2];
1226 struct lp_type type_wide_half = type_wide;
1227 LLVMTypeRef wtype_half;
1228 type_wide_half.length = 2;
1229 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1230 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1231 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1232 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1233 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1234 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1235 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1236 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1237 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1238 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1239 wtype_half, aevenlo, bevenlo);
1240 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1241 wtype_half, aoddlo, boddlo);
1242 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1243 wtype_half, aevenhi, bevenhi);
1244 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1245 wtype_half, aoddhi, boddhi);
1246 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1247 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1248
1249 }
1250 else {
1251 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1252 wider_type, aeven, beven);
1253 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1254 wider_type, aodd, bodd);
1255 }
1256 }
1257 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1258 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1259
1260 for (i = 0; i < bld->type.length; i += 2) {
1261 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1262 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1263 }
1264 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1265 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1266
1267 for (i = 0; i < bld->type.length; i += 2) {
1268 shuf[i] = lp_build_const_int32(gallivm, i);
1269 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1270 }
1271 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1272 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1273 }
1274 else {
1275 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1276 }
1277 }
1278
1279
1280 /*
1281 * Widening mul, valid for 32x32 bit -> 64bit only.
1282 * Result is low 32bits, high bits returned in res_hi.
1283 *
1284 * Emits generic code.
1285 */
1286 LLVMValueRef
1287 lp_build_mul_32_lohi(struct lp_build_context *bld,
1288 LLVMValueRef a,
1289 LLVMValueRef b,
1290 LLVMValueRef *res_hi)
1291 {
1292 struct gallivm_state *gallivm = bld->gallivm;
1293 LLVMBuilderRef builder = gallivm->builder;
1294 LLVMValueRef tmp, shift, res_lo;
1295 struct lp_type type_tmp;
1296 LLVMTypeRef wide_type, narrow_type;
1297
1298 type_tmp = bld->type;
1299 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1300 type_tmp.width *= 2;
1301 wide_type = lp_build_vec_type(gallivm, type_tmp);
1302 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1303
1304 if (bld->type.sign) {
1305 a = LLVMBuildSExt(builder, a, wide_type, "");
1306 b = LLVMBuildSExt(builder, b, wide_type, "");
1307 } else {
1308 a = LLVMBuildZExt(builder, a, wide_type, "");
1309 b = LLVMBuildZExt(builder, b, wide_type, "");
1310 }
1311 tmp = LLVMBuildMul(builder, a, b, "");
1312
1313 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1314
1315 /* Since we truncate anyway, LShr and AShr are equivalent. */
1316 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1317 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1318
1319 return res_lo;
1320 }
1321
1322
1323 /* a * b + c */
1324 LLVMValueRef
1325 lp_build_mad(struct lp_build_context *bld,
1326 LLVMValueRef a,
1327 LLVMValueRef b,
1328 LLVMValueRef c)
1329 {
1330 const struct lp_type type = bld->type;
1331 if (type.floating) {
1332 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1333 } else {
1334 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1335 }
1336 }
1337
1338
1339 /**
1340 * Small vector x scale multiplication optimization.
1341 */
1342 LLVMValueRef
1343 lp_build_mul_imm(struct lp_build_context *bld,
1344 LLVMValueRef a,
1345 int b)
1346 {
1347 LLVMBuilderRef builder = bld->gallivm->builder;
1348 LLVMValueRef factor;
1349
1350 assert(lp_check_value(bld->type, a));
1351
1352 if(b == 0)
1353 return bld->zero;
1354
1355 if(b == 1)
1356 return a;
1357
1358 if(b == -1)
1359 return lp_build_negate(bld, a);
1360
1361 if(b == 2 && bld->type.floating)
1362 return lp_build_add(bld, a, a);
1363
1364 if(util_is_power_of_two_or_zero(b)) {
1365 unsigned shift = ffs(b) - 1;
1366
1367 if(bld->type.floating) {
1368 #if 0
1369 /*
1370 * Power of two multiplication by directly manipulating the exponent.
1371 *
1372     * XXX: This might not always be faster, it will introduce a small error
1373 * for multiplication by zero, and it will produce wrong results
1374 * for Inf and NaN.
1375 */
1376 unsigned mantissa = lp_mantissa(bld->type);
1377 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1378 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1379 a = LLVMBuildAdd(builder, a, factor, "");
1380 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1381 return a;
1382 #endif
1383 }
1384 else {
1385 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1386 return LLVMBuildShl(builder, a, factor, "");
1387 }
1388 }
1389
1390 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1391 return lp_build_mul(bld, a, factor);
1392 }
1393
1394
1395 /**
1396 * Generate a / b
1397 */
1398 LLVMValueRef
1399 lp_build_div(struct lp_build_context *bld,
1400 LLVMValueRef a,
1401 LLVMValueRef b)
1402 {
1403 LLVMBuilderRef builder = bld->gallivm->builder;
1404 const struct lp_type type = bld->type;
1405
1406 assert(lp_check_value(type, a));
1407 assert(lp_check_value(type, b));
1408
1409 if(a == bld->zero)
1410 return bld->zero;
1411 if(a == bld->one && type.floating)
1412 return lp_build_rcp(bld, b);
1413 if(b == bld->zero)
1414 return bld->undef;
1415 if(b == bld->one)
1416 return a;
1417 if(a == bld->undef || b == bld->undef)
1418 return bld->undef;
1419
1420 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1421 if (type.floating)
1422 return LLVMConstFDiv(a, b);
1423 else if (type.sign)
1424 return LLVMConstSDiv(a, b);
1425 else
1426 return LLVMConstUDiv(a, b);
1427 }
1428
1429 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1430 if(FALSE &&
1431 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1432 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1433 type.floating)
1434 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1435
1436 if (type.floating)
1437 return LLVMBuildFDiv(builder, a, b, "");
1438 else if (type.sign)
1439 return LLVMBuildSDiv(builder, a, b, "");
1440 else
1441 return LLVMBuildUDiv(builder, a, b, "");
1442 }
1443
1444
1445 /**
1446 * Linear interpolation helper.
1447 *
1448 * @param normalized whether we are interpolating normalized values,
1449 * encoded in normalized integers, twice as wide.
1450 *
1451 * @sa http://www.stereopsis.com/doubleblend.html
1452 */
1453 static inline LLVMValueRef
1454 lp_build_lerp_simple(struct lp_build_context *bld,
1455 LLVMValueRef x,
1456 LLVMValueRef v0,
1457 LLVMValueRef v1,
1458 unsigned flags)
1459 {
1460 unsigned half_width = bld->type.width/2;
1461 LLVMBuilderRef builder = bld->gallivm->builder;
1462 LLVMValueRef delta;
1463 LLVMValueRef res;
1464
1465 assert(lp_check_value(bld->type, x));
1466 assert(lp_check_value(bld->type, v0));
1467 assert(lp_check_value(bld->type, v1));
1468
1469 delta = lp_build_sub(bld, v1, v0);
1470
1471 if (bld->type.floating) {
1472 assert(flags == 0);
1473 return lp_build_mad(bld, x, delta, v0);
1474 }
1475
1476 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1477 if (!bld->type.sign) {
1478 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1479 /*
1480 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1481              * most significant bit to the least significant bit, so that
1482 * later we can just divide by 2**n instead of 2**n - 1.
1483 */
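            /*
             * E.g. with half_width = 8 (illustrative arithmetic only):
             * x = 255 becomes 255 + (255 >> 7) = 256, x = 0 stays 0, so the
             * divide by 2**8 below is exact at the endpoints and a close
             * approximation of dividing by 255 elsewhere.
             */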
1484
1485 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1486 }
1487
1488 /* (x * delta) >> n */
1489 res = lp_build_mul(bld, x, delta);
1490 res = lp_build_shr_imm(bld, res, half_width);
1491 } else {
1492 /*
1493 * The rescaling trick above doesn't work for signed numbers, so
1494           * use the 2**n - 1 division approximation in lp_build_mul_norm
1495 * instead.
1496 */
1497 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1498 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1499 }
1500 } else {
1501 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1502 res = lp_build_mul(bld, x, delta);
1503 }
1504
1505 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1506 /*
1507 * At this point both res and v0 only use the lower half of the bits,
1508 * the rest is zero. Instead of add / mask, do add with half wide type.
1509 */
1510 struct lp_type narrow_type;
1511 struct lp_build_context narrow_bld;
1512
1513 memset(&narrow_type, 0, sizeof narrow_type);
1514 narrow_type.sign = bld->type.sign;
1515 narrow_type.width = bld->type.width/2;
1516 narrow_type.length = bld->type.length*2;
1517
1518 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1519 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1520 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1521 res = lp_build_add(&narrow_bld, v0, res);
1522 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1523 } else {
1524 res = lp_build_add(bld, v0, res);
1525
1526 if (bld->type.fixed) {
1527 /*
1528           * We need to mask out the high order bits when lerping 8-bit
1529           * normalized colors stored in 16 bits
1530           */
1531          /* XXX: This step is necessary for lerping 8-bit colors stored in
1532           * 16 bits, but it will be wrong for true fixed point use cases.
1533 * Basically we need a more powerful lp_type, capable of further
1534 * distinguishing the values interpretation from the value storage.
1535 */
1536 LLVMValueRef low_bits;
1537 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1538 res = LLVMBuildAnd(builder, res, low_bits, "");
1539 }
1540 }
1541
1542 return res;
1543 }
1544
1545
1546 /**
1547 * Linear interpolation.
1548 */
1549 LLVMValueRef
1550 lp_build_lerp(struct lp_build_context *bld,
1551 LLVMValueRef x,
1552 LLVMValueRef v0,
1553 LLVMValueRef v1,
1554 unsigned flags)
1555 {
1556 const struct lp_type type = bld->type;
1557 LLVMValueRef res;
1558
1559 assert(lp_check_value(type, x));
1560 assert(lp_check_value(type, v0));
1561 assert(lp_check_value(type, v1));
1562
1563 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1564
1565 if (type.norm) {
1566 struct lp_type wide_type;
1567 struct lp_build_context wide_bld;
1568 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1569
1570 assert(type.length >= 2);
1571
1572 /*
1573 * Create a wider integer type, enough to hold the
1574 * intermediate result of the multiplication.
1575 */
1576 memset(&wide_type, 0, sizeof wide_type);
1577 wide_type.sign = type.sign;
1578 wide_type.width = type.width*2;
1579 wide_type.length = type.length/2;
1580
1581 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1582
1583 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1584 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1585 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1586
1587 /*
1588 * Lerp both halves.
1589 */
1590
1591 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1592
1593 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1594 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1595
1596 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1597 } else {
1598 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1599 }
1600
1601 return res;
1602 }
1603
1604
1605 /**
1606 * Bilinear interpolation.
1607 *
1608  * Value indices are in v_{yx}.
1609 */
1610 LLVMValueRef
1611 lp_build_lerp_2d(struct lp_build_context *bld,
1612 LLVMValueRef x,
1613 LLVMValueRef y,
1614 LLVMValueRef v00,
1615 LLVMValueRef v01,
1616 LLVMValueRef v10,
1617 LLVMValueRef v11,
1618 unsigned flags)
1619 {
1620 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1621 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1622 return lp_build_lerp(bld, y, v0, v1, flags);
1623 }
1624
1625
1626 LLVMValueRef
1627 lp_build_lerp_3d(struct lp_build_context *bld,
1628 LLVMValueRef x,
1629 LLVMValueRef y,
1630 LLVMValueRef z,
1631 LLVMValueRef v000,
1632 LLVMValueRef v001,
1633 LLVMValueRef v010,
1634 LLVMValueRef v011,
1635 LLVMValueRef v100,
1636 LLVMValueRef v101,
1637 LLVMValueRef v110,
1638 LLVMValueRef v111,
1639 unsigned flags)
1640 {
1641 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1642 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1643 return lp_build_lerp(bld, z, v0, v1, flags);
1644 }
1645
1646
1647 /**
1648 * Generate min(a, b)
1649  * Do checks for special cases but not for NaNs.
1650 */
1651 LLVMValueRef
1652 lp_build_min(struct lp_build_context *bld,
1653 LLVMValueRef a,
1654 LLVMValueRef b)
1655 {
1656 assert(lp_check_value(bld->type, a));
1657 assert(lp_check_value(bld->type, b));
1658
1659 if(a == bld->undef || b == bld->undef)
1660 return bld->undef;
1661
1662 if(a == b)
1663 return a;
1664
1665 if (bld->type.norm) {
1666 if (!bld->type.sign) {
1667 if (a == bld->zero || b == bld->zero) {
1668 return bld->zero;
1669 }
1670 }
1671 if(a == bld->one)
1672 return b;
1673 if(b == bld->one)
1674 return a;
1675 }
1676
1677 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1678 }
1679
1680
1681 /**
1682 * Generate min(a, b)
1683  * NaNs are handled according to the behavior specified by the
1684 * nan_behavior argument.
1685 */
1686 LLVMValueRef
1687 lp_build_min_ext(struct lp_build_context *bld,
1688 LLVMValueRef a,
1689 LLVMValueRef b,
1690 enum gallivm_nan_behavior nan_behavior)
1691 {
1692 assert(lp_check_value(bld->type, a));
1693 assert(lp_check_value(bld->type, b));
1694
1695 if(a == bld->undef || b == bld->undef)
1696 return bld->undef;
1697
1698 if(a == b)
1699 return a;
1700
1701 if (bld->type.norm) {
1702 if (!bld->type.sign) {
1703 if (a == bld->zero || b == bld->zero) {
1704 return bld->zero;
1705 }
1706 }
1707 if(a == bld->one)
1708 return b;
1709 if(b == bld->one)
1710 return a;
1711 }
1712
1713 return lp_build_min_simple(bld, a, b, nan_behavior);
1714 }
1715
1716 /**
1717 * Generate max(a, b)
1718 * Do checks for special cases, but NaN behavior is undefined.
1719 */
1720 LLVMValueRef
1721 lp_build_max(struct lp_build_context *bld,
1722 LLVMValueRef a,
1723 LLVMValueRef b)
1724 {
1725 assert(lp_check_value(bld->type, a));
1726 assert(lp_check_value(bld->type, b));
1727
1728 if(a == bld->undef || b == bld->undef)
1729 return bld->undef;
1730
1731 if(a == b)
1732 return a;
1733
1734 if(bld->type.norm) {
1735 if(a == bld->one || b == bld->one)
1736 return bld->one;
1737 if (!bld->type.sign) {
1738 if (a == bld->zero) {
1739 return b;
1740 }
1741 if (b == bld->zero) {
1742 return a;
1743 }
1744 }
1745 }
1746
1747 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1748 }
1749
1750
1751 /**
1752 * Generate max(a, b)
1753 * Checks for special cases.
1754  * NaNs are handled according to the behavior specified by the
1755 * nan_behavior argument.
1756 */
1757 LLVMValueRef
1758 lp_build_max_ext(struct lp_build_context *bld,
1759 LLVMValueRef a,
1760 LLVMValueRef b,
1761 enum gallivm_nan_behavior nan_behavior)
1762 {
1763 assert(lp_check_value(bld->type, a));
1764 assert(lp_check_value(bld->type, b));
1765
1766 if(a == bld->undef || b == bld->undef)
1767 return bld->undef;
1768
1769 if(a == b)
1770 return a;
1771
1772 if(bld->type.norm) {
1773 if(a == bld->one || b == bld->one)
1774 return bld->one;
1775 if (!bld->type.sign) {
1776 if (a == bld->zero) {
1777 return b;
1778 }
1779 if (b == bld->zero) {
1780 return a;
1781 }
1782 }
1783 }
1784
1785 return lp_build_max_simple(bld, a, b, nan_behavior);
1786 }
1787
1788 /**
1789 * Generate clamp(a, min, max)
1790 * NaN behavior (for any of a, min, max) is undefined.
1791 * Do checks for special cases.
1792 */
1793 LLVMValueRef
1794 lp_build_clamp(struct lp_build_context *bld,
1795 LLVMValueRef a,
1796 LLVMValueRef min,
1797 LLVMValueRef max)
1798 {
1799 assert(lp_check_value(bld->type, a));
1800 assert(lp_check_value(bld->type, min));
1801 assert(lp_check_value(bld->type, max));
1802
1803 a = lp_build_min(bld, a, max);
1804 a = lp_build_max(bld, a, min);
1805 return a;
1806 }
1807
1808
1809 /**
1810 * Generate clamp(a, 0, 1)
1811 * A NaN will get converted to zero.
1812 */
1813 LLVMValueRef
1814 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1815 LLVMValueRef a)
1816 {
1817 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1818 a = lp_build_min(bld, a, bld->one);
1819 return a;
1820 }
1821
1822
1823 /**
1824 * Generate abs(a)
1825 */
1826 LLVMValueRef
1827 lp_build_abs(struct lp_build_context *bld,
1828 LLVMValueRef a)
1829 {
1830 LLVMBuilderRef builder = bld->gallivm->builder;
1831 const struct lp_type type = bld->type;
1832 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1833
1834 assert(lp_check_value(type, a));
1835
1836 if(!type.sign)
1837 return a;
1838
1839 if(type.floating) {
1840 if (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR > 6 && LLVM_VERSION_MINOR < 9) {
1841 /* Workaround llvm.org/PR27332 */
1842 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1843 unsigned long long absMask = ~(1ULL << (type.width - 1));
1844 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1845 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1846 a = LLVMBuildAnd(builder, a, mask, "");
1847 a = LLVMBuildBitCast(builder, a, vec_type, "");
1848 return a;
1849 } else {
1850 char intrinsic[32];
1851 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1852 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1853 }
1854 }
1855
1856 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1857 switch(type.width) {
1858 case 8:
1859 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1860 case 16:
1861 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1862 case 32:
1863 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1864 }
1865 }
1866 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
1867 switch(type.width) {
1868 case 8:
1869 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1870 case 16:
1871 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1872 case 32:
1873 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1874 }
1875 }
1876
1877 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1878 a, LLVMBuildNeg(builder, a, ""));
1879 }
1880
1881
1882 LLVMValueRef
1883 lp_build_negate(struct lp_build_context *bld,
1884 LLVMValueRef a)
1885 {
1886 LLVMBuilderRef builder = bld->gallivm->builder;
1887
1888 assert(lp_check_value(bld->type, a));
1889
1890 if (bld->type.floating)
1891 a = LLVMBuildFNeg(builder, a, "");
1892 else
1893 a = LLVMBuildNeg(builder, a, "");
1894
1895 return a;
1896 }
1897
1898
1899 /** Return -1, 0 or +1 depending on the sign of a */
1900 LLVMValueRef
1901 lp_build_sgn(struct lp_build_context *bld,
1902 LLVMValueRef a)
1903 {
1904 LLVMBuilderRef builder = bld->gallivm->builder;
1905 const struct lp_type type = bld->type;
1906 LLVMValueRef cond;
1907 LLVMValueRef res;
1908
1909 assert(lp_check_value(type, a));
1910
1911 /* Handle non-zero case */
1912 if(!type.sign) {
1913 /* if not zero then sign must be positive */
1914 res = bld->one;
1915 }
1916 else if(type.floating) {
1917 LLVMTypeRef vec_type;
1918 LLVMTypeRef int_type;
1919 LLVMValueRef mask;
1920 LLVMValueRef sign;
1921 LLVMValueRef one;
1922 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1923
1924 int_type = lp_build_int_vec_type(bld->gallivm, type);
1925 vec_type = lp_build_vec_type(bld->gallivm, type);
1926 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1927
1928 /* Take the sign bit and add it to 1 constant */
1929 sign = LLVMBuildBitCast(builder, a, int_type, "");
1930 sign = LLVMBuildAnd(builder, sign, mask, "");
1931 one = LLVMConstBitCast(bld->one, int_type);
1932 res = LLVMBuildOr(builder, sign, one, "");
1933 res = LLVMBuildBitCast(builder, res, vec_type, "");
1934 }
1935 else
1936 {
1937 /* signed int/norm/fixed point */
1938 /* could use psign with sse3 and appropriate vectors here */
1939 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1940 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1941 res = lp_build_select(bld, cond, bld->one, minus_one);
1942 }
1943
1944 /* Handle zero */
1945 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1946 res = lp_build_select(bld, cond, bld->zero, res);
1947
1948 return res;
1949 }
1950
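/*
 * A worked example of the floating point path above (illustrative values,
 * not from the original source): for a = -3.5f the masked sign bit is
 * 0x80000000; OR'ing it into the bits of 1.0f (0x3f800000) gives
 * 0xbf800000, i.e. -1.0f.  The final compare against zero then fixes up
 * sgn(+/-0.0) to 0.
 */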
1951
1952 /**
1953 * Set the sign of float vector 'a' according to 'sign'.
1954 * If sign==0, return abs(a).
1955 * If sign==1, return -abs(a);
1956 * Other values for sign produce undefined results.
1957 */
1958 LLVMValueRef
1959 lp_build_set_sign(struct lp_build_context *bld,
1960 LLVMValueRef a, LLVMValueRef sign)
1961 {
1962 LLVMBuilderRef builder = bld->gallivm->builder;
1963 const struct lp_type type = bld->type;
1964 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1965 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1966 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1967 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1968 ~((unsigned long long) 1 << (type.width - 1)));
1969 LLVMValueRef val, res;
1970
1971 assert(type.floating);
1972 assert(lp_check_value(type, a));
1973
1974 /* val = reinterpret_cast<int>(a) */
1975 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1976 /* val = val & mask */
1977 val = LLVMBuildAnd(builder, val, mask, "");
1978 /* sign = sign << shift */
1979 sign = LLVMBuildShl(builder, sign, shift, "");
1980 /* res = val | sign */
1981 res = LLVMBuildOr(builder, val, sign, "");
1982 /* res = reinterpret_cast<float>(res) */
1983 res = LLVMBuildBitCast(builder, res, vec_type, "");
1984
1985 return res;
1986 }
1987
1988
1989 /**
1990 * Convert vector of (or scalar) int to vector of (or scalar) float.
1991 */
1992 LLVMValueRef
1993 lp_build_int_to_float(struct lp_build_context *bld,
1994 LLVMValueRef a)
1995 {
1996 LLVMBuilderRef builder = bld->gallivm->builder;
1997 const struct lp_type type = bld->type;
1998 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1999
2000 assert(type.floating);
2001
2002 return LLVMBuildSIToFP(builder, a, vec_type, "");
2003 }
2004
2005 static boolean
2006 arch_rounding_available(const struct lp_type type)
2007 {
2008 if ((util_cpu_caps.has_sse4_1 &&
2009 (type.length == 1 || type.width*type.length == 128)) ||
2010 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
2011 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
2012 return TRUE;
2013 else if ((util_cpu_caps.has_altivec &&
2014 (type.width == 32 && type.length == 4)))
2015 return TRUE;
2016 else if (util_cpu_caps.has_neon)
2017 return TRUE;
2018
2019 return FALSE;
2020 }
2021
2022 enum lp_build_round_mode
2023 {
2024 LP_BUILD_ROUND_NEAREST = 0,
2025 LP_BUILD_ROUND_FLOOR = 1,
2026 LP_BUILD_ROUND_CEIL = 2,
2027 LP_BUILD_ROUND_TRUNCATE = 3
2028 };
2029
2030 static inline LLVMValueRef
2031 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2032 LLVMValueRef a)
2033 {
2034 LLVMBuilderRef builder = bld->gallivm->builder;
2035 const struct lp_type type = bld->type;
2036 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2037 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2038 const char *intrinsic;
2039 LLVMValueRef res;
2040
2041 assert(type.floating);
2042 /* using the double precision conversions is a bit more complicated */
2043 assert(type.width == 32);
2044
2045 assert(lp_check_value(type, a));
2046 assert(util_cpu_caps.has_sse2);
2047
2048 /* This is relying on MXCSR rounding mode, which should always be nearest. */
2049 if (type.length == 1) {
2050 LLVMTypeRef vec_type;
2051 LLVMValueRef undef;
2052 LLVMValueRef arg;
2053 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2054
2055 vec_type = LLVMVectorType(bld->elem_type, 4);
2056
2057 intrinsic = "llvm.x86.sse.cvtss2si";
2058
2059 undef = LLVMGetUndef(vec_type);
2060
2061 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2062
2063 res = lp_build_intrinsic_unary(builder, intrinsic,
2064 ret_type, arg);
2065 }
2066 else {
2067 if (type.width* type.length == 128) {
2068 intrinsic = "llvm.x86.sse2.cvtps2dq";
2069 }
2070 else {
2071 assert(type.width*type.length == 256);
2072 assert(util_cpu_caps.has_avx);
2073
2074 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2075 }
2076 res = lp_build_intrinsic_unary(builder, intrinsic,
2077 ret_type, a);
2078 }
2079
2080 return res;
2081 }
2082
2083
2084 /* Round to integral value (nearest/floor/ceil/trunc according to mode)
2085  * using the AltiVec vrfin/vrfim/vrfip/vrfiz instructions. */
2086 static inline LLVMValueRef
2087 lp_build_round_altivec(struct lp_build_context *bld,
2088 LLVMValueRef a,
2089 enum lp_build_round_mode mode)
2090 {
2091 LLVMBuilderRef builder = bld->gallivm->builder;
2092 const struct lp_type type = bld->type;
2093 const char *intrinsic = NULL;
2094
2095 assert(type.floating);
2096
2097 assert(lp_check_value(type, a));
2098 assert(util_cpu_caps.has_altivec);
2099
2100 (void)type;
2101
2102 switch (mode) {
2103 case LP_BUILD_ROUND_NEAREST:
2104 intrinsic = "llvm.ppc.altivec.vrfin";
2105 break;
2106 case LP_BUILD_ROUND_FLOOR:
2107 intrinsic = "llvm.ppc.altivec.vrfim";
2108 break;
2109 case LP_BUILD_ROUND_CEIL:
2110 intrinsic = "llvm.ppc.altivec.vrfip";
2111 break;
2112 case LP_BUILD_ROUND_TRUNCATE:
2113 intrinsic = "llvm.ppc.altivec.vrfiz";
2114 break;
2115 }
2116
2117 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2118 }
2119
2120 static inline LLVMValueRef
2121 lp_build_round_arch(struct lp_build_context *bld,
2122 LLVMValueRef a,
2123 enum lp_build_round_mode mode)
2124 {
2125 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2126 LLVMBuilderRef builder = bld->gallivm->builder;
2127 const struct lp_type type = bld->type;
2128 const char *intrinsic_root;
2129 char intrinsic[32];
2130
2131 assert(type.floating);
2132 assert(lp_check_value(type, a));
2133 (void)type;
2134
2135 switch (mode) {
2136 case LP_BUILD_ROUND_NEAREST:
2137 intrinsic_root = "llvm.nearbyint";
2138 break;
2139 case LP_BUILD_ROUND_FLOOR:
2140 intrinsic_root = "llvm.floor";
2141 break;
2142 case LP_BUILD_ROUND_CEIL:
2143 intrinsic_root = "llvm.ceil";
2144 break;
2145 case LP_BUILD_ROUND_TRUNCATE:
2146 intrinsic_root = "llvm.trunc";
2147 break;
2148 }
2149
2150 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2151 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2152 }
2153 else /* (util_cpu_caps.has_altivec) */
2154 return lp_build_round_altivec(bld, a, mode);
2155 }
2156
2157 /**
2158 * Return the integer part of a float (vector) value (== round toward zero).
2159 * The returned value is a float (vector).
2160 * Ex: trunc(-1.5) = -1.0
2161 */
2162 LLVMValueRef
2163 lp_build_trunc(struct lp_build_context *bld,
2164 LLVMValueRef a)
2165 {
2166 LLVMBuilderRef builder = bld->gallivm->builder;
2167 const struct lp_type type = bld->type;
2168
2169 assert(type.floating);
2170 assert(lp_check_value(type, a));
2171
2172 if (arch_rounding_available(type)) {
2173 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2174 }
2175 else {
2176 const struct lp_type type = bld->type;
2177 struct lp_type inttype;
2178 struct lp_build_context intbld;
2179 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2180 LLVMValueRef trunc, res, anosign, mask;
2181 LLVMTypeRef int_vec_type = bld->int_vec_type;
2182 LLVMTypeRef vec_type = bld->vec_type;
2183
2184 assert(type.width == 32); /* might want to handle doubles at some point */
2185
2186 inttype = type;
2187 inttype.floating = 0;
2188 lp_build_context_init(&intbld, bld->gallivm, inttype);
2189
2190 /* round by truncation */
2191 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2192 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2193
2194 /* mask out sign bit */
2195 anosign = lp_build_abs(bld, a);
2196 /*
2197 * mask out all values if anosign > 2^24
2198 * This should work both for large ints (all rounding is no-op for them
2199 * because such floats are always exact) as well as special cases like
2200 * NaNs, Infs (taking advantage of the fact they use max exponent).
2201 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2202 */
2203 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2204 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2205 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2206 return lp_build_select(bld, mask, a, res);
2207 }
2208 }
2209
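/*
 * A scalar sketch of the fallback path above, assuming 32-bit floats
 * (helper name is illustrative only):
 *
 *    float trunc_ref(float a)
 *    {
 *       float res = (float)(int)a;            // FPToSI + SIToFP round trip
 *       return fabsf(a) > 0x1p24f ? a : res;
 *    }
 *
 * The 2^24 threshold works because a float has a 24 bit mantissa, so any
 * value with |a| >= 2^24 carries no fractional bits and is already
 * integral.  The code above does the compare on the integer bit pattern
 * rather than a float compare so that NaNs and Infs (which use the maximum
 * exponent) also take the "pass a through unchanged" path.
 */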
2210
2211 /**
2212 * Return float (vector) rounded to nearest integer (vector). The returned
2213 * value is a float (vector).
2214 * Ex: round(0.9) = 1.0
2215 * Ex: round(-1.5) = -2.0
2216 */
2217 LLVMValueRef
2218 lp_build_round(struct lp_build_context *bld,
2219 LLVMValueRef a)
2220 {
2221 LLVMBuilderRef builder = bld->gallivm->builder;
2222 const struct lp_type type = bld->type;
2223
2224 assert(type.floating);
2225 assert(lp_check_value(type, a));
2226
2227 if (arch_rounding_available(type)) {
2228 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2229 }
2230 else {
2231 const struct lp_type type = bld->type;
2232 struct lp_type inttype;
2233 struct lp_build_context intbld;
2234 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2235 LLVMValueRef res, anosign, mask;
2236 LLVMTypeRef int_vec_type = bld->int_vec_type;
2237 LLVMTypeRef vec_type = bld->vec_type;
2238
2239 assert(type.width == 32); /* might want to handle doubles at some point */
2240
2241 inttype = type;
2242 inttype.floating = 0;
2243 lp_build_context_init(&intbld, bld->gallivm, inttype);
2244
2245 res = lp_build_iround(bld, a);
2246 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2247
2248 /* mask out sign bit */
2249 anosign = lp_build_abs(bld, a);
2250 /*
2251 * mask out all values if anosign > 2^24
2252 * This should work both for large ints (all rounding is no-op for them
2253 * because such floats are always exact) as well as special cases like
2254 * NaNs, Infs (taking advantage of the fact they use max exponent).
2255 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2256 */
2257 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2258 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2259 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2260 return lp_build_select(bld, mask, a, res);
2261 }
2262 }
2263
2264
2265 /**
2266 * Return floor of float (vector), result is a float (vector)
2267 * Ex: floor(1.1) = 1.0
2268 * Ex: floor(-1.1) = -2.0
2269 */
2270 LLVMValueRef
2271 lp_build_floor(struct lp_build_context *bld,
2272 LLVMValueRef a)
2273 {
2274 LLVMBuilderRef builder = bld->gallivm->builder;
2275 const struct lp_type type = bld->type;
2276
2277 assert(type.floating);
2278 assert(lp_check_value(type, a));
2279
2280 if (arch_rounding_available(type)) {
2281 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2282 }
2283 else {
2284 const struct lp_type type = bld->type;
2285 struct lp_type inttype;
2286 struct lp_build_context intbld;
2287 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2288 LLVMValueRef trunc, res, anosign, mask;
2289 LLVMTypeRef int_vec_type = bld->int_vec_type;
2290 LLVMTypeRef vec_type = bld->vec_type;
2291
2292 if (type.width != 32) {
2293 char intrinsic[32];
2294 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2295 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2296 }
2297
2298 assert(type.width == 32); /* might want to handle doubles at some point */
2299
2300 inttype = type;
2301 inttype.floating = 0;
2302 lp_build_context_init(&intbld, bld->gallivm, inttype);
2303
2304 /* round by truncation */
2305 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2306 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2307
2308 if (type.sign) {
2309 LLVMValueRef tmp;
2310
2311 /*
2312 * fix values if rounding is wrong (for non-special cases)
2313 * - this is the case if trunc > a
2314 */
2315 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2316 /* tmp = trunc > a ? 1.0 : 0.0 */
2317 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2318 tmp = lp_build_and(&intbld, mask, tmp);
2319 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2320 res = lp_build_sub(bld, res, tmp);
2321 }
2322
2323 /* mask out sign bit */
2324 anosign = lp_build_abs(bld, a);
2325 /*
2326 * mask out all values if anosign > 2^24
2327 * This should work both for large ints (all rounding is no-op for them
2328 * because such floats are always exact) as well as special cases like
2329 * NaNs, Infs (taking advantage of the fact they use max exponent).
2330 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2331 */
2332 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2333 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2334 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2335 return lp_build_select(bld, mask, a, res);
2336 }
2337 }
2338
2339
2340 /**
2341 * Return ceiling of float (vector), returning float (vector).
2342 * Ex: ceil( 1.1) = 2.0
2343 * Ex: ceil(-1.1) = -1.0
2344 */
2345 LLVMValueRef
2346 lp_build_ceil(struct lp_build_context *bld,
2347 LLVMValueRef a)
2348 {
2349 LLVMBuilderRef builder = bld->gallivm->builder;
2350 const struct lp_type type = bld->type;
2351
2352 assert(type.floating);
2353 assert(lp_check_value(type, a));
2354
2355 if (arch_rounding_available(type)) {
2356 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2357 }
2358 else {
2359 const struct lp_type type = bld->type;
2360 struct lp_type inttype;
2361 struct lp_build_context intbld;
2362 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2363 LLVMValueRef trunc, res, anosign, mask, tmp;
2364 LLVMTypeRef int_vec_type = bld->int_vec_type;
2365 LLVMTypeRef vec_type = bld->vec_type;
2366
2367 if (type.width != 32) {
2368 char intrinsic[32];
2369 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2370 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2371 }
2372
2373 assert(type.width == 32); /* might want to handle doubles at some point */
2374
2375 inttype = type;
2376 inttype.floating = 0;
2377 lp_build_context_init(&intbld, bld->gallivm, inttype);
2378
2379 /* round by truncation */
2380 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2381 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2382
2383 /*
2384 * fix values if rounding is wrong (for non-special cases)
2385 * - this is the case if trunc < a
2386 */
2387 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2388 /* tmp = trunc < a ? 1.0 : 0.0 */
2389 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2390 tmp = lp_build_and(&intbld, mask, tmp);
2391 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2392 res = lp_build_add(bld, trunc, tmp);
2393
2394 /* mask out sign bit */
2395 anosign = lp_build_abs(bld, a);
2396 /*
2397 * mask out all values if anosign > 2^24
2398 * This should work both for large ints (all rounding is no-op for them
2399 * because such floats are always exact) as well as special cases like
2400 * NaNs, Infs (taking advantage of the fact they use max exponent).
2401 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2402 */
2403 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2404 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2405 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2406 return lp_build_select(bld, mask, a, res);
2407 }
2408 }
2409
2410
2411 /**
2412 * Return fractional part of 'a' computed as a - floor(a)
2413 * Typically used in texture coord arithmetic.
2414 */
2415 LLVMValueRef
2416 lp_build_fract(struct lp_build_context *bld,
2417 LLVMValueRef a)
2418 {
2419 assert(bld->type.floating);
2420 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2421 }
2422
2423
2424 /**
2425 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2426 * against 0.99999(9). (Will also return that value for NaNs.)
2427 */
2428 static inline LLVMValueRef
2429 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2430 {
2431 LLVMValueRef max;
2432
2433 /* this is the largest number smaller than 1.0 representable as float */
2434 max = lp_build_const_vec(bld->gallivm, bld->type,
2435 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2436 return lp_build_min_ext(bld, fract, max,
2437 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2438 }
2439
2440
2441 /**
2442 * Same as lp_build_fract, but guarantees that the result is always smaller
2443 * than one. Will also return the smaller-than-one value for infs, NaNs.
2444 */
2445 LLVMValueRef
2446 lp_build_fract_safe(struct lp_build_context *bld,
2447 LLVMValueRef a)
2448 {
2449 return clamp_fract(bld, lp_build_fract(bld, a));
2450 }
2451
2452
2453 /**
2454 * Return the integer part of a float (vector) value (== round toward zero).
2455 * The returned value is an integer (vector).
2456 * Ex: itrunc(-1.5) = -1
2457 */
2458 LLVMValueRef
2459 lp_build_itrunc(struct lp_build_context *bld,
2460 LLVMValueRef a)
2461 {
2462 LLVMBuilderRef builder = bld->gallivm->builder;
2463 const struct lp_type type = bld->type;
2464 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2465
2466 assert(type.floating);
2467 assert(lp_check_value(type, a));
2468
2469 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2470 }
2471
2472
2473 /**
2474 * Return float (vector) rounded to nearest integer (vector). The returned
2475 * value is an integer (vector).
2476 * Ex: iround(0.9) = 1
2477 * Ex: iround(-1.5) = -2
2478 */
2479 LLVMValueRef
2480 lp_build_iround(struct lp_build_context *bld,
2481 LLVMValueRef a)
2482 {
2483 LLVMBuilderRef builder = bld->gallivm->builder;
2484 const struct lp_type type = bld->type;
2485 LLVMTypeRef int_vec_type = bld->int_vec_type;
2486 LLVMValueRef res;
2487
2488 assert(type.floating);
2489
2490 assert(lp_check_value(type, a));
2491
2492 if ((util_cpu_caps.has_sse2 &&
2493 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2494 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2495 return lp_build_iround_nearest_sse2(bld, a);
2496 }
2497 if (arch_rounding_available(type)) {
2498 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2499 }
2500 else {
2501 LLVMValueRef half;
2502
2503 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2504
2505 if (type.sign) {
2506 LLVMTypeRef vec_type = bld->vec_type;
2507 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2508 (unsigned long long)1 << (type.width - 1));
2509 LLVMValueRef sign;
2510
2511 /* get sign bit */
2512 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2513 sign = LLVMBuildAnd(builder, sign, mask, "");
2514
2515 /* sign * 0.5 */
2516 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2517 half = LLVMBuildOr(builder, sign, half, "");
2518 half = LLVMBuildBitCast(builder, half, vec_type, "");
2519 }
2520
2521 res = LLVMBuildFAdd(builder, a, half, "");
2522 }
2523
2524 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2525
2526 return res;
2527 }
2528
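/*
 * A worked example of the fallback path above (illustrative values): for
 * a = -2.7f the sign bit of a is OR'd into half, giving about -0.49999997,
 * so a + half is about -3.1999999 and the final FPToSI truncation yields
 * -3.  nextafterf(0.5, 0.0) is used instead of 0.5 so that inputs just
 * below 0.5 (e.g. 0.49999997f) are not pushed up to 1.0 by the addition.
 */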
2529
2530 /**
2531 * Return floor of float (vector), result is an int (vector)
2532 * Ex: ifloor(1.1) = 1
2533 * Ex: ifloor(-1.1) = -2
2534 */
2535 LLVMValueRef
2536 lp_build_ifloor(struct lp_build_context *bld,
2537 LLVMValueRef a)
2538 {
2539 LLVMBuilderRef builder = bld->gallivm->builder;
2540 const struct lp_type type = bld->type;
2541 LLVMTypeRef int_vec_type = bld->int_vec_type;
2542 LLVMValueRef res;
2543
2544 assert(type.floating);
2545 assert(lp_check_value(type, a));
2546
2547 res = a;
2548 if (type.sign) {
2549 if (arch_rounding_available(type)) {
2550 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2551 }
2552 else {
2553 struct lp_type inttype;
2554 struct lp_build_context intbld;
2555 LLVMValueRef trunc, itrunc, mask;
2556
2557 assert(type.floating);
2558 assert(lp_check_value(type, a));
2559
2560 inttype = type;
2561 inttype.floating = 0;
2562 lp_build_context_init(&intbld, bld->gallivm, inttype);
2563
2564 /* round by truncation */
2565 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2566 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2567
2568 /*
2569 * fix values if rounding is wrong (for non-special cases)
2570 * - this is the case if trunc > a
2571 * The results of doing this with NaNs, very large values etc.
2572 * are undefined but this seems to be the case anyway.
2573 */
2574 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2575 /* cheapie minus one with mask since the mask is minus one / zero */
2576 return lp_build_add(&intbld, itrunc, mask);
2577 }
2578 }
2579
2580 /* convert to int, truncating (value is non-negative or already integral) */
2581 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2582
2583 return res;
2584 }
2585
2586
2587 /**
2588 * Return ceiling of float (vector), returning int (vector).
2589 * Ex: iceil( 1.1) = 2
2590 * Ex: iceil(-1.1) = -1
2591 */
2592 LLVMValueRef
2593 lp_build_iceil(struct lp_build_context *bld,
2594 LLVMValueRef a)
2595 {
2596 LLVMBuilderRef builder = bld->gallivm->builder;
2597 const struct lp_type type = bld->type;
2598 LLVMTypeRef int_vec_type = bld->int_vec_type;
2599 LLVMValueRef res;
2600
2601 assert(type.floating);
2602 assert(lp_check_value(type, a));
2603
2604 if (arch_rounding_available(type)) {
2605 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2606 }
2607 else {
2608 struct lp_type inttype;
2609 struct lp_build_context intbld;
2610 LLVMValueRef trunc, itrunc, mask;
2611
2612 assert(type.floating);
2613 assert(lp_check_value(type, a));
2614
2615 inttype = type;
2616 inttype.floating = 0;
2617 lp_build_context_init(&intbld, bld->gallivm, inttype);
2618
2619 /* round by truncation */
2620 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2621 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2622
2623 /*
2624 * fix values if rounding is wrong (for non-special cases)
2625 * - this is the case if trunc < a
2626 * The results of doing this with NaNs, very large values etc.
2627 * are undefined but this seems to be the case anyway.
2628 */
2629 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2630 /* cheapie plus one with mask since the mask is minus one / zero */
2631 return lp_build_sub(&intbld, itrunc, mask);
2632 }
2633
2634 /* convert to int, truncating (value is already integral here) */
2635 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2636
2637 return res;
2638 }
2639
2640
2641 /**
2642 * Combined ifloor() & fract().
2643 *
2644 * Preferred to calling the functions separately, as it will ensure that the
2645 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2646 */
2647 void
2648 lp_build_ifloor_fract(struct lp_build_context *bld,
2649 LLVMValueRef a,
2650 LLVMValueRef *out_ipart,
2651 LLVMValueRef *out_fpart)
2652 {
2653 LLVMBuilderRef builder = bld->gallivm->builder;
2654 const struct lp_type type = bld->type;
2655 LLVMValueRef ipart;
2656
2657 assert(type.floating);
2658 assert(lp_check_value(type, a));
2659
2660 if (arch_rounding_available(type)) {
2661 /*
2662 * floor() is easier.
2663 */
2664
2665 ipart = lp_build_floor(bld, a);
2666 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2667 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2668 }
2669 else {
2670 /*
2671 * ifloor() is easier.
2672 */
2673
2674 *out_ipart = lp_build_ifloor(bld, a);
2675 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2676 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2677 }
2678 }
2679
2680
2681 /**
2682 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2683 * always smaller than one.
2684 */
2685 void
2686 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2687 LLVMValueRef a,
2688 LLVMValueRef *out_ipart,
2689 LLVMValueRef *out_fpart)
2690 {
2691 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2692 *out_fpart = clamp_fract(bld, *out_fpart);
2693 }
2694
2695
2696 LLVMValueRef
2697 lp_build_sqrt(struct lp_build_context *bld,
2698 LLVMValueRef a)
2699 {
2700 LLVMBuilderRef builder = bld->gallivm->builder;
2701 const struct lp_type type = bld->type;
2702 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2703 char intrinsic[32];
2704
2705 assert(lp_check_value(type, a));
2706
2707 assert(type.floating);
2708 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2709
2710 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2711 }
2712
2713
2714 /**
2715 * Do one Newton-Raphson step to improve reciprocal precision:
2716 *
2717 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2718 *
2719 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2720 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2721 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2722 * halo. It would be necessary to clamp the argument to prevent this.
2723 *
2724 * See also:
2725 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2726 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2727 */
2728 static inline LLVMValueRef
2729 lp_build_rcp_refine(struct lp_build_context *bld,
2730 LLVMValueRef a,
2731 LLVMValueRef rcp_a)
2732 {
2733 LLVMBuilderRef builder = bld->gallivm->builder;
2734 LLVMValueRef neg_a;
2735 LLVMValueRef res;
2736
2737 neg_a = LLVMBuildFNeg(builder, a, "");
2738 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2739 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2740
2741 return res;
2742 }
2743
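/*
 * A worked example of the refinement step above (illustrative values):
 * approximating 1/3 from a typical ~12-bit RCPPS estimate x_0 = 0.333252,
 *
 *    x_1 = x_0 + x_0 * (1 - 3 * x_0) ~= 0.33333331
 *
 * so one step roughly doubles the number of correct bits, which is why a
 * single iteration typically suffices for single precision.
 */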
2744
2745 LLVMValueRef
2746 lp_build_rcp(struct lp_build_context *bld,
2747 LLVMValueRef a)
2748 {
2749 LLVMBuilderRef builder = bld->gallivm->builder;
2750 const struct lp_type type = bld->type;
2751
2752 assert(lp_check_value(type, a));
2753
2754 if(a == bld->zero)
2755 return bld->undef;
2756 if(a == bld->one)
2757 return bld->one;
2758 if(a == bld->undef)
2759 return bld->undef;
2760
2761 assert(type.floating);
2762
2763 if(LLVMIsConstant(a))
2764 return LLVMConstFDiv(bld->one, a);
2765
2766 /*
2767 * We don't use RCPPS because:
2768 * - it only has 10bits of precision
2769 * - it doesn't even get the reciprocal of 1.0 exactly
2770 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2771 * - for recent processors the benefit over DIVPS is marginal and case
2772 * dependent
2773 *
2774 * We could still use it on certain processors if benchmarks show that the
2775 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2776 * particular uses that require less workarounds.
2777 */
2778
2779 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2780 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2781 const unsigned num_iterations = 0;
2782 LLVMValueRef res;
2783 unsigned i;
2784 const char *intrinsic = NULL;
2785
2786 if (type.length == 4) {
2787 intrinsic = "llvm.x86.sse.rcp.ps";
2788 }
2789 else {
2790 intrinsic = "llvm.x86.avx.rcp.ps.256";
2791 }
2792
2793 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2794
2795 for (i = 0; i < num_iterations; ++i) {
2796 res = lp_build_rcp_refine(bld, a, res);
2797 }
2798
2799 return res;
2800 }
2801
2802 return LLVMBuildFDiv(builder, bld->one, a, "");
2803 }
2804
2805
2806 /**
2807 * Do one Newton-Raphson step to improve rsqrt precision:
2808 *
2809 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2810 *
2811 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2812 */
2813 static inline LLVMValueRef
2814 lp_build_rsqrt_refine(struct lp_build_context *bld,
2815 LLVMValueRef a,
2816 LLVMValueRef rsqrt_a)
2817 {
2818 LLVMBuilderRef builder = bld->gallivm->builder;
2819 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2820 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2821 LLVMValueRef res;
2822
2823 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2824 res = LLVMBuildFMul(builder, a, res, "");
2825 res = LLVMBuildFSub(builder, three, res, "");
2826 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2827 res = LLVMBuildFMul(builder, half, res, "");
2828
2829 return res;
2830 }
2831
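/*
 * A worked example of the refinement step above (illustrative values):
 * for a = 4.0 and a rough estimate x_0 = 0.51,
 *
 *    x_1 = 0.5 * 0.51 * (3.0 - 4.0 * 0.51 * 0.51) ~= 0.4997
 *
 * which moves the error from about 1e-2 to about 3e-4; each step roughly
 * doubles the number of correct bits.
 */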
2832
2833 /**
2834 * Generate 1/sqrt(a).
2835 * Result is undefined for values < 0, infinity for +0.
2836 */
2837 LLVMValueRef
2838 lp_build_rsqrt(struct lp_build_context *bld,
2839 LLVMValueRef a)
2840 {
2841 const struct lp_type type = bld->type;
2842
2843 assert(lp_check_value(type, a));
2844
2845 assert(type.floating);
2846
2847 /*
2848 * This should be faster but all denormals will end up as infinity.
2849 */
2850 if (0 && lp_build_fast_rsqrt_available(type)) {
2851 const unsigned num_iterations = 1;
2852 LLVMValueRef res;
2853 unsigned i;
2854
2855 /* rsqrt(1.0) != 1.0 here */
2856 res = lp_build_fast_rsqrt(bld, a);
2857
2858 if (num_iterations) {
2859 /*
2860 * Newton-Raphson will result in NaN instead of infinity for zero,
2861 * and NaN instead of zero for infinity.
2862 * Also, need to ensure rsqrt(1.0) == 1.0.
2863 * All numbers smaller than FLT_MIN will result in +infinity
2864 * (rsqrtps treats all denormals as zero).
2865 */
2866 LLVMValueRef cmp;
2867 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2868 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2869
2870 for (i = 0; i < num_iterations; ++i) {
2871 res = lp_build_rsqrt_refine(bld, a, res);
2872 }
2873 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2874 res = lp_build_select(bld, cmp, inf, res);
2875 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2876 res = lp_build_select(bld, cmp, bld->zero, res);
2877 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2878 res = lp_build_select(bld, cmp, bld->one, res);
2879 }
2880
2881 return res;
2882 }
2883
2884 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2885 }
2886
2887 /**
2888 * Return whether a fast (but inaccurate) rsqrt instruction is available.
2889 * (The caller may want to avoid calling rsqrt_fast if it's not available;
2890 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2891 * unavailable this would result in sqrt/div/mul, so it is obviously much
2892 * better to just call sqrt, skipping both div and mul.)
2893 */
2894 boolean
2895 lp_build_fast_rsqrt_available(struct lp_type type)
2896 {
2897 assert(type.floating);
2898
2899 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2900 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2901 return true;
2902 }
2903 return false;
2904 }
2905
2906
2907 /**
2908 * Generate 1/sqrt(a).
2909 * Result is undefined for values < 0, infinity for +0.
2910 * Precision is limited, only ~10 bits guaranteed
2911 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2912 */
2913 LLVMValueRef
2914 lp_build_fast_rsqrt(struct lp_build_context *bld,
2915 LLVMValueRef a)
2916 {
2917 LLVMBuilderRef builder = bld->gallivm->builder;
2918 const struct lp_type type = bld->type;
2919
2920 assert(lp_check_value(type, a));
2921
2922 if (lp_build_fast_rsqrt_available(type)) {
2923 const char *intrinsic = NULL;
2924
2925 if (type.length == 4) {
2926 intrinsic = "llvm.x86.sse.rsqrt.ps";
2927 }
2928 else {
2929 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2930 }
2931 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2932 }
2933 else {
2934 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2935 }
2936 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2937 }
2938
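/*
 * A usage sketch of the two helpers above (hypothetical caller, not part
 * of this file): computing sqrt(x) as x * rsqrt_fast(x) only when the
 * fast instruction exists, otherwise calling the precise sqrt directly.
 *
 *    static LLVMValueRef
 *    example_sqrt_via_rsqrt(struct lp_build_context *bld, LLVMValueRef x)
 *    {
 *       if (lp_build_fast_rsqrt_available(bld->type)) {
 *          // one mul instead of sqrt + div + mul
 *          return lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
 *       }
 *       return lp_build_sqrt(bld, x);
 *    }
 */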
2939
2940 /**
2941 * Generate sin(a) or cos(a) using polynomial approximation.
2942 * TODO: it might be worth recognizing sin and cos with the same source
2943 * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2944 * much cheaper than calculating (nearly) everything twice.
2945 * It's unclear whether that's common enough to be worth bothering with;
2946 * the scs opcode could also benefit from calculating both, though.
2947 */
2948 static LLVMValueRef
2949 lp_build_sin_or_cos(struct lp_build_context *bld,
2950 LLVMValueRef a,
2951 boolean cos)
2952 {
2953 struct gallivm_state *gallivm = bld->gallivm;
2954 LLVMBuilderRef b = gallivm->builder;
2955 struct lp_type int_type = lp_int_type(bld->type);
2956
2957 /*
2958 * take the absolute value,
2959 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2960 */
2961
2962 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2963 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2964
2965 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2966 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2967
2968 /*
2969 * scale by 4/Pi
2970 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2971 */
2972
2973 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2974 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2975
2976 /*
2977 * store the integer part of y in mm0
2978 * emm2 = _mm_cvttps_epi32(y);
2979 */
2980
2981 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2982
2983 /*
2984 * j=(j+1) & (~1) (see the cephes sources)
2985 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2986 */
2987
2988 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2989 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2990 /*
2991 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2992 */
2993 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2994 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2995
2996 /*
2997 * y = _mm_cvtepi32_ps(emm2);
2998 */
2999 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
3000
3001 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
3002 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
3003 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
3004 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
3005
3006 /*
3007 * Argument used for poly selection and sign bit determination
3008 * is different for sin vs. cos.
3009 */
3010 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
3011 emm2_and;
3012
3013 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
3014 LLVMBuildNot(b, emm2_2, ""), ""),
3015 const_29, "sign_bit") :
3016 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
3017 LLVMBuildShl(b, emm2_add,
3018 const_29, ""), ""),
3019 sign_mask, "sign_bit");
3020
3021 /*
3022 * get the polynomial selection mask
3023 * there is one polynomial for 0 <= x <= Pi/4
3024 * and another one for Pi/4 < x <= Pi/2
3025 * Both branches will be computed.
3026 *
3027 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3028 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3029 */
3030
3031 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3032 LLVMValueRef poly_mask = lp_build_compare(gallivm,
3033 int_type, PIPE_FUNC_EQUAL,
3034 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3035
3036 /*
3037 * _PS_CONST(minus_cephes_DP1, -0.78515625);
3038 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3039 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3040 */
3041 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3042 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3043 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3044
3045 /*
3046 * The magic pass: "Extended precision modular arithmetic"
3047 * x = ((x - y * DP1) - y * DP2) - y * DP3;
3048 */
3049 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3050 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3051 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3052
3053 /*
3054 * Evaluate the first polynomial (0 <= x <= Pi/4)
3055 *
3056 * z = _mm_mul_ps(x,x);
3057 */
3058 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3059
3060 /*
3061 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3062 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3063 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3064 */
3065 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3066 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3067 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3068
3069 /*
3070 * y = *(v4sf*)_ps_coscof_p0;
3071 * y = _mm_mul_ps(y, z);
3072 */
3073 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3074 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3075 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3076 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3077
3078
3079 /*
3080 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3081 * y = _mm_sub_ps(y, tmp);
3082 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3083 */
3084 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3085 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3086 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3087 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3088 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3089
3090 /*
3091 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3092 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3093 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3094 */
3095 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3096 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3097 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3098
3099 /*
3100 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
3101 *
3102 * y2 = *(v4sf*)_ps_sincof_p0;
3103 * y2 = _mm_mul_ps(y2, z);
3104 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3105 * y2 = _mm_mul_ps(y2, z);
3106 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3107 * y2 = _mm_mul_ps(y2, z);
3108 * y2 = _mm_mul_ps(y2, x);
3109 * y2 = _mm_add_ps(y2, x);
3110 */
3111
3112 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3113 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3114 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3115 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3116
3117 /*
3118 * select the correct result from the two polynomials
3119 * xmm3 = poly_mask;
3120 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3121 * y = _mm_andnot_ps(xmm3, y);
3122 * y = _mm_or_ps(y,y2);
3123 */
3124 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3125 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3126 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3127 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3128 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3129 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3130
3131 /*
3132 * update the sign
3133 * y = _mm_xor_ps(y, sign_bit);
3134 */
3135 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3136 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3137
3138 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3139
3140 /* clamp output to be within [-1, 1] */
3141 y_result = lp_build_clamp(bld, y_result,
3142 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3143 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3144 /* If a is -inf, inf or NaN then return NaN */
3145 y_result = lp_build_select(bld, isfinite, y_result,
3146 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3147 return y_result;
3148 }
3149
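/*
 * A worked example of the range reduction above, for sin with a = 2.0
 * radians (illustrative values):
 *
 *    scale_y = 2.0 * 4/Pi               ~= 2.546
 *    emm2    = ((int)scale_y + 1) & ~1   = 2
 *    x       = a - emm2 * Pi/4           ~= 0.4292   (computed in three
 *                                                     steps via DP1..DP3)
 *
 * Since emm2 & 2 is non-zero the cosine polynomial is selected, and the
 * sign computation leaves the sign bit clear, so the result is
 * cos(2.0 - Pi/2) ~= 0.9093, which equals sin(2.0).
 */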
3150
3151 /**
3152 * Generate sin(a)
3153 */
3154 LLVMValueRef
3155 lp_build_sin(struct lp_build_context *bld,
3156 LLVMValueRef a)
3157 {
3158 return lp_build_sin_or_cos(bld, a, FALSE);
3159 }
3160
3161
3162 /**
3163 * Generate cos(a)
3164 */
3165 LLVMValueRef
3166 lp_build_cos(struct lp_build_context *bld,
3167 LLVMValueRef a)
3168 {
3169 return lp_build_sin_or_cos(bld, a, TRUE);
3170 }
3171
3172
3173 /**
3174 * Generate pow(x, y)
3175 */
3176 LLVMValueRef
3177 lp_build_pow(struct lp_build_context *bld,
3178 LLVMValueRef x,
3179 LLVMValueRef y)
3180 {
3181 /* TODO: optimize the constant case */
3182 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3183 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3184 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3185 __FUNCTION__);
3186 }
3187
3188 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3189 }
3190
3191
3192 /**
3193 * Generate exp(x)
3194 */
3195 LLVMValueRef
3196 lp_build_exp(struct lp_build_context *bld,
3197 LLVMValueRef x)
3198 {
3199 /* log2(e) = 1/log(2) */
3200 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3201 1.4426950408889634);
3202
3203 assert(lp_check_value(bld->type, x));
3204
3205 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3206 }
3207
3208
3209 /**
3210 * Generate log(x)
3211 * Behavior is undefined with infs, 0s and nans
3212 */
3213 LLVMValueRef
3214 lp_build_log(struct lp_build_context *bld,
3215 LLVMValueRef x)
3216 {
3217 /* log(2) */
3218 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3219 0.69314718055994529);
3220
3221 assert(lp_check_value(bld->type, x));
3222
3223 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3224 }
3225
3226 /**
3227 * Generate log(x) that handles edge cases (infs, 0s and nans)
3228 */
3229 LLVMValueRef
3230 lp_build_log_safe(struct lp_build_context *bld,
3231 LLVMValueRef x)
3232 {
3233 /* log(2) */
3234 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3235 0.69314718055994529);
3236
3237 assert(lp_check_value(bld->type, x));
3238
3239 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3240 }
3241
3242
3243 /**
3244 * Generate polynomial.
3245 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3246 */
3247 LLVMValueRef
3248 lp_build_polynomial(struct lp_build_context *bld,
3249 LLVMValueRef x,
3250 const double *coeffs,
3251 unsigned num_coeffs)
3252 {
3253 const struct lp_type type = bld->type;
3254 LLVMValueRef even = NULL, odd = NULL;
3255 LLVMValueRef x2;
3256 unsigned i;
3257
3258 assert(lp_check_value(bld->type, x));
3259
3260 /* TODO: optimize the constant case */
3261 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3262 LLVMIsConstant(x)) {
3263 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3264 __FUNCTION__);
3265 }
3266
3267 /*
3268 * Calculate odd and even terms separately to decrease data dependency
3269 * Ex:
3270 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3271 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3272 */
3273 x2 = lp_build_mul(bld, x, x);
3274
3275 for (i = num_coeffs; i--; ) {
3276 LLVMValueRef coeff;
3277
3278 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3279
3280 if (i % 2 == 0) {
3281 if (even)
3282 even = lp_build_mad(bld, x2, even, coeff);
3283 else
3284 even = coeff;
3285 } else {
3286 if (odd)
3287 odd = lp_build_mad(bld, x2, odd, coeff);
3288 else
3289 odd = coeff;
3290 }
3291 }
3292
3293 if (odd)
3294 return lp_build_mad(bld, odd, x, even);
3295 else if (even)
3296 return even;
3297 else
3298 return bld->undef;
3299 }
3300
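/*
 * A worked expansion of the odd/even split above for num_coeffs == 4,
 * with coefficients c0..c3 (illustrative only):
 *
 *    even = c0 + x2 * c2
 *    odd  = c1 + x2 * c3
 *    res  = even + x * odd  =  c0 + c1*x + c2*x^2 + c3*x^3
 *
 * Compared with a plain Horner evaluation the even and odd chains can
 * issue in parallel, shortening the dependency chain for the higher
 * degree polynomials used by exp2/log2.
 */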
3301
3302 /**
3303 * Minimax polynomial fit of 2**x, in range [0, 1[
3304 */
3305 const double lp_build_exp2_polynomial[] = {
3306 #if EXP_POLY_DEGREE == 5
3307 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3308 0.693153073200168932794,
3309 0.240153617044375388211,
3310 0.0558263180532956664775,
3311 0.00898934009049466391101,
3312 0.00187757667519147912699
3313 #elif EXP_POLY_DEGREE == 4
3314 1.00000259337069434683,
3315 0.693003834469974940458,
3316 0.24144275689150793076,
3317 0.0520114606103070150235,
3318 0.0135341679161270268764
3319 #elif EXP_POLY_DEGREE == 3
3320 0.999925218562710312959,
3321 0.695833540494823811697,
3322 0.226067155427249155588,
3323 0.0780245226406372992967
3324 #elif EXP_POLY_DEGREE == 2
3325 1.00172476321474503578,
3326 0.657636275736077639316,
3327 0.33718943461968720704
3328 #else
3329 #error
3330 #endif
3331 };
3332
3333
3334 LLVMValueRef
3335 lp_build_exp2(struct lp_build_context *bld,
3336 LLVMValueRef x)
3337 {
3338 LLVMBuilderRef builder = bld->gallivm->builder;
3339 const struct lp_type type = bld->type;
3340 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3341 LLVMValueRef ipart = NULL;
3342 LLVMValueRef fpart = NULL;
3343 LLVMValueRef expipart = NULL;
3344 LLVMValueRef expfpart = NULL;
3345 LLVMValueRef res = NULL;
3346
3347 assert(lp_check_value(bld->type, x));
3348
3349 /* TODO: optimize the constant case */
3350 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3351 LLVMIsConstant(x)) {
3352 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3353 __FUNCTION__);
3354 }
3355
3356 assert(type.floating && type.width == 32);
3357
3358 /* We want to preserve NaN and make sure that for exp2, if x > 128
3359 * the result is INF, and if it's smaller than -126.9 the result is 0 */
3360 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3361 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3362 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3363 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3364
3365 /* ipart = floor(x) */
3366 /* fpart = x - ipart */
3367 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3368
3369 /* expipart = (float) (1 << ipart) */
3370 expipart = LLVMBuildAdd(builder, ipart,
3371 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3372 expipart = LLVMBuildShl(builder, expipart,
3373 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3374 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3375
3376 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3377 ARRAY_SIZE(lp_build_exp2_polynomial));
3378
3379 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3380
3381 return res;
3382 }
3383
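/*
 * A worked example of the decomposition above (illustrative values): for
 * x = 3.25, lp_build_ifloor_fract() gives ipart = 3 and fpart = 0.25.
 *
 *    expipart = bitcast((3 + 127) << 23) = 0x41000000 = 8.0  (i.e. 2^ipart,
 *               built by writing the biased exponent field directly)
 *    expfpart = polynomial(0.25)        ~= 1.1892            (i.e. 2^0.25)
 *    res      = 8.0 * 1.1892            ~= 9.5137            (i.e. 2^3.25)
 */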
3384
3385
3386 /**
3387 * Extract the exponent of an IEEE-754 floating point value.
3388 *
3389 * Optionally apply an integer bias.
3390 *
3391 * Result is an integer value with
3392 *
3393 * ifloor(log2(x)) + bias
3394 */
3395 LLVMValueRef
3396 lp_build_extract_exponent(struct lp_build_context *bld,
3397 LLVMValueRef x,
3398 int bias)
3399 {
3400 LLVMBuilderRef builder = bld->gallivm->builder;
3401 const struct lp_type type = bld->type;
3402 unsigned mantissa = lp_mantissa(type);
3403 LLVMValueRef res;
3404
3405 assert(type.floating);
3406
3407 assert(lp_check_value(bld->type, x));
3408
3409 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3410
3411 res = LLVMBuildLShr(builder, x,
3412 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3413 res = LLVMBuildAnd(builder, res,
3414 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3415 res = LLVMBuildSub(builder, res,
3416 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3417
3418 return res;
3419 }
3420
3421
3422 /**
3423 * Extract the mantissa of a floating point value.
3424 *
3425 * Result is a floating point value with
3426 *
3427 * x / 2**floor(log2(x))
3428 */
3429 LLVMValueRef
3430 lp_build_extract_mantissa(struct lp_build_context *bld,
3431 LLVMValueRef x)
3432 {
3433 LLVMBuilderRef builder = bld->gallivm->builder;
3434 const struct lp_type type = bld->type;
3435 unsigned mantissa = lp_mantissa(type);
3436 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3437 (1ULL << mantissa) - 1);
3438 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3439 LLVMValueRef res;
3440
3441 assert(lp_check_value(bld->type, x));
3442
3443 assert(type.floating);
3444
3445 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3446
3447 /* res = x / 2**ipart */
3448 res = LLVMBuildAnd(builder, x, mantmask, "");
3449 res = LLVMBuildOr(builder, res, one, "");
3450 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3451
3452 return res;
3453 }
3454
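/*
 * A worked example for the two helpers above (illustrative values):
 * x = 6.0f has the bit pattern 0x40c00000, i.e. biased exponent 129 and
 * mantissa bits 0x400000, so
 *
 *    lp_build_extract_exponent(bld, x, 0)  ->  129 - 127 = 2
 *    lp_build_extract_mantissa(bld, x)     ->  6.0 / 2^2 = 1.5
 *
 * and x == extract_mantissa(x) * 2^extract_exponent(x) for any normalized
 * positive x.
 */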
3455
3456
3457 /**
3458 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3459 * These coefficients can be generated with
3460 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3461 */
3462 const double lp_build_log2_polynomial[] = {
3463 #if LOG_POLY_DEGREE == 5
3464 2.88539008148777786488L,
3465 0.961796878841293367824L,
3466 0.577058946784739859012L,
3467 0.412914355135828735411L,
3468 0.308591899232910175289L,
3469 0.352376952300281371868L,
3470 #elif LOG_POLY_DEGREE == 4
3471 2.88539009343309178325L,
3472 0.961791550404184197881L,
3473 0.577440339438736392009L,
3474 0.403343858251329912514L,
3475 0.406718052498846252698L,
3476 #elif LOG_POLY_DEGREE == 3
3477 2.88538959748872753838L,
3478 0.961932915889597772928L,
3479 0.571118517972136195241L,
3480 0.493997535084709500285L,
3481 #else
3482 #error
3483 #endif
3484 };
3485
3486 /**
3487 * See http://www.devmaster.net/forums/showthread.php?p=43580
3488 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3489 * http://www.nezumi.demon.co.uk/consult/logx.htm
3490 *
3491 * If handle_edge_cases is true the function will perform computations
3492 * to match the required D3D10+ behavior for each of the edge cases.
3493 * That means that if input is:
3494 * - less than zero (to and including -inf) then NaN will be returned
3495 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3496 * - +infinity, then +infinity will be returned
3497 * - NaN, then NaN will be returned
3498 *
3499 * Those checks are fairly expensive so if you don't need them make sure
3500 * handle_edge_cases is false.
3501 */
3502 void
3503 lp_build_log2_approx(struct lp_build_context *bld,
3504 LLVMValueRef x,
3505 LLVMValueRef *p_exp,
3506 LLVMValueRef *p_floor_log2,
3507 LLVMValueRef *p_log2,
3508 boolean handle_edge_cases)
3509 {
3510 LLVMBuilderRef builder = bld->gallivm->builder;
3511 const struct lp_type type = bld->type;
3512 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3513 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3514
3515 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3516 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3517 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3518
3519 LLVMValueRef i = NULL;
3520 LLVMValueRef y = NULL;
3521 LLVMValueRef z = NULL;
3522 LLVMValueRef exp = NULL;
3523 LLVMValueRef mant = NULL;
3524 LLVMValueRef logexp = NULL;
3525 LLVMValueRef p_z = NULL;
3526 LLVMValueRef res = NULL;
3527
3528 assert(lp_check_value(bld->type, x));
3529
3530 if(p_exp || p_floor_log2 || p_log2) {
3531 /* TODO: optimize the constant case */
3532 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3533 LLVMIsConstant(x)) {
3534 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3535 __FUNCTION__);
3536 }
3537
3538 assert(type.floating && type.width == 32);
3539
3540 /*
3541 * We don't explicitly handle denormalized numbers. They will yield a
3542 * result in the neighbourhood of -127, which appears to be
3543 * adequate.
3544 */
3545
3546 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3547
3548 /* exp = (float) exponent(x) */
3549 exp = LLVMBuildAnd(builder, i, expmask, "");
3550 }
3551
3552 if(p_floor_log2 || p_log2) {
3553 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3554 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3555 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3556 }
3557
3558 if (p_log2) {
3559 /* mant = 1 + (float) mantissa(x) */
3560 mant = LLVMBuildAnd(builder, i, mantmask, "");
3561 mant = LLVMBuildOr(builder, mant, one, "");
3562 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3563
3564 /* y = (mant - 1) / (mant + 1) */
3565 y = lp_build_div(bld,
3566 lp_build_sub(bld, mant, bld->one),
3567 lp_build_add(bld, mant, bld->one)
3568 );
3569
3570 /* z = y^2 */
3571 z = lp_build_mul(bld, y, y);
3572
3573 /* compute P(z) */
3574 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3575 ARRAY_SIZE(lp_build_log2_polynomial));
3576
3577 /* y * P(z) + logexp */
3578 res = lp_build_mad(bld, y, p_z, logexp);
3579
3580 if (type.floating && handle_edge_cases) {
3581 LLVMValueRef negmask, infmask, zmask;
3582 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3583 lp_build_const_vec(bld->gallivm, type, 0.0f));
3584 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3585 lp_build_const_vec(bld->gallivm, type, 0.0f));
3586 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3587 lp_build_const_vec(bld->gallivm, type, INFINITY));
3588
3589 /* If x is equal to inf, make sure we return inf */
3590 res = lp_build_select(bld, infmask,
3591 lp_build_const_vec(bld->gallivm, type, INFINITY),
3592 res);
3593 /* If x is equal to 0, return -inf */
3594 res = lp_build_select(bld, zmask,
3595 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3596 res);
3597 /* If x is NaN or less than 0, return NaN */
3598 res = lp_build_select(bld, negmask,
3599 lp_build_const_vec(bld->gallivm, type, NAN),
3600 res);
3601 }
3602 }
3603
3604 if (p_exp) {
3605 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3606 *p_exp = exp;
3607 }
3608
3609 if (p_floor_log2)
3610 *p_floor_log2 = logexp;
3611
3612 if (p_log2)
3613 *p_log2 = res;
3614 }
3615
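/*
 * Illustrative scalar sketch of the decomposition used above (hypothetical
 * helper, not part of the build): split x into exponent and mantissa, map
 * the mantissa m in [1, 2) through y = (m - 1) / (m + 1), and evaluate the
 * polynomial in z = y*y.  Assumes lp_build_log2_polynomial stores the
 * lowest-order coefficient first, as lp_build_polynomial expects, and that
 * x is a positive normal number.
 */
#if 0
static float
ref_log2(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   /* floor(log2(x)) from the exponent field */
   float e = (float)((int)((u.i >> 23) & 0xff) - 127);
   /* force the mantissa back into [1, 2) */
   u.i = (u.i & 0x007fffff) | 0x3f800000;
   float m = u.f;
   float y = (m - 1.0f) / (m + 1.0f);
   float z = y * y;
   /* Horner evaluation of P(z) */
   float p = 0.0f;
   for (int i = ARRAY_SIZE(lp_build_log2_polynomial); i--; )
      p = p * z + (float)lp_build_log2_polynomial[i];
   /* log2(x) = y * P(z) + floor(log2(x)) */
   return y * p + e;
}
#endif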
3616
3617 /*
3618 * log2 implementation which doesn't have special code to
3619 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3620 * the results for those cases are undefined.
3621 */
3622 LLVMValueRef
3623 lp_build_log2(struct lp_build_context *bld,
3624 LLVMValueRef x)
3625 {
3626 LLVMValueRef res;
3627 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3628 return res;
3629 }
3630
3631 /*
3632 * Version of log2 which handles all edge cases.
3633 * See the documentation of lp_build_log2_approx for a
3634 * description of the behavior for each of the edge cases.
3635 */
3636 LLVMValueRef
3637 lp_build_log2_safe(struct lp_build_context *bld,
3638 LLVMValueRef x)
3639 {
3640 LLVMValueRef res;
3641 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3642 return res;
3643 }
3644
3645
3646 /**
3647 * Faster (and less accurate) log2.
3648 *
3649 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3650 *
3651 * Piece-wise linear approximation, with exact results when x is a
3652 * power of two.
3653 *
3654 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3655 */
3656 LLVMValueRef
3657 lp_build_fast_log2(struct lp_build_context *bld,
3658 LLVMValueRef x)
3659 {
3660 LLVMBuilderRef builder = bld->gallivm->builder;
3661 LLVMValueRef ipart;
3662 LLVMValueRef fpart;
3663
3664 assert(lp_check_value(bld->type, x));
3665
3666 assert(bld->type.floating);
3667
3668 /* ipart = floor(log2(x)) - 1 */
3669 ipart = lp_build_extract_exponent(bld, x, -1);
3670 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3671
3672 /* fpart = x / 2**ipart */
3673 fpart = lp_build_extract_mantissa(bld, x);
3674
3675 /* ipart + fpart */
3676 return LLVMBuildFAdd(builder, ipart, fpart, "");
3677 }
3678
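/*
 * Scalar sketch of the piecewise-linear approximation above (illustrative
 * only, not compiled): the exponent field gives floor(log2(x)) - 1 and the
 * mantissa, forced back into [1, 2), is x / 2**floor(log2(x)), so the sum
 * is exact whenever x is a power of two.
 */
#if 0
static float
ref_fast_log2(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   /* ipart = floor(log2(x)) - 1 */
   float ipart = (float)((int)((u.i >> 23) & 0xff) - 127 - 1);
   /* fpart = x / 2**floor(log2(x)), i.e. the mantissa in [1, 2) */
   u.i = (u.i & 0x007fffff) | 0x3f800000;
   return ipart + u.f;
}
#endif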
3679
3680 /**
3681 * Fast implementation of iround(log2(x)).
3682 *
3683 * Not an approximation -- it should give accurate results all the time.
3684 */
3685 LLVMValueRef
3686 lp_build_ilog2(struct lp_build_context *bld,
3687 LLVMValueRef x)
3688 {
3689 LLVMBuilderRef builder = bld->gallivm->builder;
3690 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3691 LLVMValueRef ipart;
3692
3693 assert(bld->type.floating);
3694
3695 assert(lp_check_value(bld->type, x));
3696
3697 /* x * 2^0.5, i.e., add 0.5 to log2(x) */
3698 x = LLVMBuildFMul(builder, x, sqrt2, "");
3699
3700 /* ipart = floor(log2(x) + 0.5) */
3701 ipart = lp_build_extract_exponent(bld, x, 0);
3702
3703 return ipart;
3704 }
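
/*
 * Worked example of the sqrt(2) trick above (hypothetical scalar helper,
 * not compiled): scaling by 2^0.5 before truncating to the exponent turns
 * floor() into round-to-nearest, e.g. x = 5 -> 5*1.414 = 7.07, exponent 2
 * (round(log2(5)) = 2), and x = 6 -> 8.49, exponent 3 (round(log2(6)) = 3).
 */
#if 0
static int
ref_ilog2(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x * (float)M_SQRT2;
   return (int)((u.i >> 23) & 0xff) - 127;
}
#endif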
3705
3706 LLVMValueRef
3707 lp_build_mod(struct lp_build_context *bld,
3708 LLVMValueRef x,
3709 LLVMValueRef y)
3710 {
3711 LLVMBuilderRef builder = bld->gallivm->builder;
3712 LLVMValueRef res;
3713 const struct lp_type type = bld->type;
3714
3715 assert(lp_check_value(type, x));
3716 assert(lp_check_value(type, y));
3717
3718 if (type.floating)
3719 res = LLVMBuildFRem(builder, x, y, "");
3720 else if (type.sign)
3721 res = LLVMBuildSRem(builder, x, y, "");
3722 else
3723 res = LLVMBuildURem(builder, x, y, "");
3724 return res;
3725 }
3726
3727
3728 /*
3729 * For floating-point inputs, creates and returns a mask
3730 * which is all 1's for channels that are NaN.
3731 * Channels of x that are not NaN will be all 0's.
3732 */
3733 LLVMValueRef
3734 lp_build_isnan(struct lp_build_context *bld,
3735 LLVMValueRef x)
3736 {
3737 LLVMValueRef mask;
3738 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3739
3740 assert(bld->type.floating);
3741 assert(lp_check_value(bld->type, x));
3742
3743 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3744 "isnotnan");
3745 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3746 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3747 return mask;
3748 }
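
/*
 * Scalar equivalent of the comparison trick above (illustrative only, not
 * compiled): NaN is the only value that does not compare ordered-equal to
 * itself, which is why the ordered-equal result is negated and then
 * sign-extended into the per-channel mask.
 */
#if 0
static boolean
ref_isnan(float x)
{
   return !(x == x);
}
#endif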
3749
3750 /* Returns all 1's for floating point numbers that are
3751 * finite, and returns all zeros for -inf,
3752 * +inf and NaNs. */
3753 LLVMValueRef
3754 lp_build_isfinite(struct lp_build_context *bld,
3755 LLVMValueRef x)
3756 {
3757 LLVMBuilderRef builder = bld->gallivm->builder;
3758 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3759 struct lp_type int_type = lp_int_type(bld->type);
3760 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3761 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3762 0x7f800000);
3763
3764 if (!bld->type.floating) {
3765 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3766 }
3767 assert(bld->type.floating);
3768 assert(lp_check_value(bld->type, x));
3769 assert(bld->type.width == 32);
3770
3771 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3772 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3773 intx, infornan32);
3774 }
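
/*
 * Bit-level view of the test above (hypothetical scalar helper, not
 * compiled): an IEEE-754 single is finite iff its exponent field is not
 * all ones, which is exactly the 0x7f800000 mask comparison.
 */
#if 0
static boolean
ref_isfinite(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   return (u.i & 0x7f800000) != 0x7f800000;
}
#endif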
3775
3776 /*
3777 * Returns true if the number is NaN or inf, and false otherwise.
3778 * The input has to be a floating point vector.
3779 */
3780 LLVMValueRef
3781 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3782 const struct lp_type type,
3783 LLVMValueRef x)
3784 {
3785 LLVMBuilderRef builder = gallivm->builder;
3786 struct lp_type int_type = lp_int_type(type);
3787 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3788 0x7f800000);
3789 LLVMValueRef ret;
3790
3791 assert(type.floating);
3792
3793 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3794 ret = LLVMBuildAnd(builder, ret, const0, "");
3795 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3796 ret, const0);
3797
3798 return ret;
3799 }
3800
3801
3802 LLVMValueRef
3803 lp_build_fpstate_get(struct gallivm_state *gallivm)
3804 {
3805 if (util_cpu_caps.has_sse) {
3806 LLVMBuilderRef builder = gallivm->builder;
3807 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3808 gallivm,
3809 LLVMInt32TypeInContext(gallivm->context),
3810 "mxcsr_ptr");
3811 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3812 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3813 lp_build_intrinsic(builder,
3814 "llvm.x86.sse.stmxcsr",
3815 LLVMVoidTypeInContext(gallivm->context),
3816 &mxcsr_ptr8, 1, 0);
3817 return mxcsr_ptr;
3818 }
3819 return 0;
3820 }
3821
3822 void
3823 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3824 boolean zero)
3825 {
3826 if (util_cpu_caps.has_sse) {
3827 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3828 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3829
3830 LLVMBuilderRef builder = gallivm->builder;
3831 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3832 LLVMValueRef mxcsr =
3833 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3834
3835 if (util_cpu_caps.has_daz) {
3836 /* Enable denormals-are-zero mode */
3837 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3838 }
3839 if (zero) {
3840 mxcsr = LLVMBuildOr(builder, mxcsr,
3841 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3842 } else {
3843 mxcsr = LLVMBuildAnd(builder, mxcsr,
3844 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3845 }
3846
3847 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3848 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3849 }
3850 }
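
/*
 * Host-side sketch of the same MXCSR manipulation (illustrative only, not
 * compiled; the generated code above additionally gates the DAZ bit on
 * util_cpu_caps.has_daz).
 */
#if 0
static void
ref_set_denorms_zero(boolean zero)
{
#if defined(PIPE_ARCH_SSE)
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
   unsigned mxcsr = _mm_getcsr();
   mxcsr = zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz);
   _mm_setcsr(mxcsr);
#endif
}
#endif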
3851
3852 void
3853 lp_build_fpstate_set(struct gallivm_state *gallivm,
3854 LLVMValueRef mxcsr_ptr)
3855 {
3856 if (util_cpu_caps.has_sse) {
3857 LLVMBuilderRef builder = gallivm->builder;
3858 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3859 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3860 lp_build_intrinsic(builder,
3861 "llvm.x86.sse.ldmxcsr",
3862 LLVMVoidTypeInContext(gallivm->context),
3863 &mxcsr_ptr, 1, 0);
3864 }
3865 }