1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
50 #include <llvm/Config/llvm-config.h>
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
80 #define EXP_POLY_DEGREE 5
82 #define LOG_POLY_DEGREE 4
87 * No checks for special case values of a or b = 1 or 0 are done.
88 * NaN's are handled according to the behavior specified by the
89 * nan_behavior argument.
92 lp_build_min_simple(struct lp_build_context
*bld
,
95 enum gallivm_nan_behavior nan_behavior
)
97 const struct lp_type type
= bld
->type
;
98 const char *intrinsic
= NULL
;
99 unsigned intr_size
= 0;
102 assert(lp_check_value(type
, a
));
103 assert(lp_check_value(type
, b
));
105 /* TODO: optimize the constant case */
107 if (type
.floating
&& util_cpu_caps
.has_sse
) {
108 if (type
.width
== 32) {
109 if (type
.length
== 1) {
110 intrinsic
= "llvm.x86.sse.min.ss";
113 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
114 intrinsic
= "llvm.x86.sse.min.ps";
118 intrinsic
= "llvm.x86.avx.min.ps.256";
122 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
123 if (type
.length
== 1) {
124 intrinsic
= "llvm.x86.sse2.min.sd";
127 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
128 intrinsic
= "llvm.x86.sse2.min.pd";
132 intrinsic
= "llvm.x86.avx.min.pd.256";
137 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
138 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
139 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
143 if (type
.width
== 32 && type
.length
== 4) {
144 intrinsic
= "llvm.ppc.altivec.vminfp";
147 } else if (util_cpu_caps
.has_altivec
) {
149 if (type
.width
== 8) {
151 intrinsic
= "llvm.ppc.altivec.vminub";
153 intrinsic
= "llvm.ppc.altivec.vminsb";
155 } else if (type
.width
== 16) {
157 intrinsic
= "llvm.ppc.altivec.vminuh";
159 intrinsic
= "llvm.ppc.altivec.vminsh";
161 } else if (type
.width
== 32) {
163 intrinsic
= "llvm.ppc.altivec.vminuw";
165 intrinsic
= "llvm.ppc.altivec.vminsw";
171 /* We need to handle nan's for floating point numbers. If one of the
172 * inputs is nan the other should be returned (required by both D3D10+
174 * The sse intrinsics return the second operator in case of nan by
175 * default so we need to special code to handle those.
177 if (util_cpu_caps
.has_sse
&& type
.floating
&&
178 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
179 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
180 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
181 LLVMValueRef isnan
, min
;
182 min
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
185 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
186 isnan
= lp_build_isnan(bld
, b
);
187 return lp_build_select(bld
, isnan
, a
, min
);
189 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
190 isnan
= lp_build_isnan(bld
, a
);
191 return lp_build_select(bld
, isnan
, a
, min
);
194 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
201 switch (nan_behavior
) {
202 case GALLIVM_NAN_RETURN_NAN
: {
203 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
204 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
205 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
206 return lp_build_select(bld
, cond
, a
, b
);
209 case GALLIVM_NAN_RETURN_OTHER
: {
210 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
211 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
212 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
213 return lp_build_select(bld
, cond
, a
, b
);
216 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
217 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_LESS
, a
, b
);
218 return lp_build_select(bld
, cond
, a
, b
);
219 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
220 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, b
, a
);
221 return lp_build_select(bld
, cond
, b
, a
);
222 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
223 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
224 return lp_build_select(bld
, cond
, a
, b
);
228 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
229 return lp_build_select(bld
, cond
, a
, b
);
232 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
233 return lp_build_select(bld
, cond
, a
, b
);
239 lp_build_fmuladd(LLVMBuilderRef builder
,
244 LLVMTypeRef type
= LLVMTypeOf(a
);
245 assert(type
== LLVMTypeOf(b
));
246 assert(type
== LLVMTypeOf(c
));
249 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fmuladd", type
);
250 LLVMValueRef args
[] = { a
, b
, c
};
251 return lp_build_intrinsic(builder
, intrinsic
, type
, args
, 3, 0);
257 * No checks for special case values of a or b = 1 or 0 are done.
258 * NaN's are handled according to the behavior specified by the
259 * nan_behavior argument.
262 lp_build_max_simple(struct lp_build_context
*bld
,
265 enum gallivm_nan_behavior nan_behavior
)
267 const struct lp_type type
= bld
->type
;
268 const char *intrinsic
= NULL
;
269 unsigned intr_size
= 0;
272 assert(lp_check_value(type
, a
));
273 assert(lp_check_value(type
, b
));
275 /* TODO: optimize the constant case */
277 if (type
.floating
&& util_cpu_caps
.has_sse
) {
278 if (type
.width
== 32) {
279 if (type
.length
== 1) {
280 intrinsic
= "llvm.x86.sse.max.ss";
283 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
284 intrinsic
= "llvm.x86.sse.max.ps";
288 intrinsic
= "llvm.x86.avx.max.ps.256";
292 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
293 if (type
.length
== 1) {
294 intrinsic
= "llvm.x86.sse2.max.sd";
297 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
298 intrinsic
= "llvm.x86.sse2.max.pd";
302 intrinsic
= "llvm.x86.avx.max.pd.256";
307 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
308 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
309 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
310 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
313 if (type
.width
== 32 || type
.length
== 4) {
314 intrinsic
= "llvm.ppc.altivec.vmaxfp";
317 } else if (util_cpu_caps
.has_altivec
) {
319 if (type
.width
== 8) {
321 intrinsic
= "llvm.ppc.altivec.vmaxub";
323 intrinsic
= "llvm.ppc.altivec.vmaxsb";
325 } else if (type
.width
== 16) {
327 intrinsic
= "llvm.ppc.altivec.vmaxuh";
329 intrinsic
= "llvm.ppc.altivec.vmaxsh";
331 } else if (type
.width
== 32) {
333 intrinsic
= "llvm.ppc.altivec.vmaxuw";
335 intrinsic
= "llvm.ppc.altivec.vmaxsw";
341 if (util_cpu_caps
.has_sse
&& type
.floating
&&
342 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
343 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
344 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
345 LLVMValueRef isnan
, max
;
346 max
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
349 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
350 isnan
= lp_build_isnan(bld
, b
);
351 return lp_build_select(bld
, isnan
, a
, max
);
353 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
354 isnan
= lp_build_isnan(bld
, a
);
355 return lp_build_select(bld
, isnan
, a
, max
);
358 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
365 switch (nan_behavior
) {
366 case GALLIVM_NAN_RETURN_NAN
: {
367 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
368 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
369 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
370 return lp_build_select(bld
, cond
, a
, b
);
373 case GALLIVM_NAN_RETURN_OTHER
: {
374 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
375 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
376 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
377 return lp_build_select(bld
, cond
, a
, b
);
380 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
381 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_GREATER
, a
, b
);
382 return lp_build_select(bld
, cond
, a
, b
);
383 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
384 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, a
);
385 return lp_build_select(bld
, cond
, b
, a
);
386 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
387 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
388 return lp_build_select(bld
, cond
, a
, b
);
392 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
393 return lp_build_select(bld
, cond
, a
, b
);
396 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
397 return lp_build_select(bld
, cond
, a
, b
);
403 * Generate 1 - a, or ~a depending on bld->type.
406 lp_build_comp(struct lp_build_context
*bld
,
409 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
410 const struct lp_type type
= bld
->type
;
412 assert(lp_check_value(type
, a
));
419 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
420 if(LLVMIsConstant(a
))
421 return LLVMConstNot(a
);
423 return LLVMBuildNot(builder
, a
, "");
426 if(LLVMIsConstant(a
))
428 return LLVMConstFSub(bld
->one
, a
);
430 return LLVMConstSub(bld
->one
, a
);
433 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
435 return LLVMBuildSub(builder
, bld
->one
, a
, "");
443 lp_build_add(struct lp_build_context
*bld
,
447 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
448 const struct lp_type type
= bld
->type
;
451 assert(lp_check_value(type
, a
));
452 assert(lp_check_value(type
, b
));
458 if (a
== bld
->undef
|| b
== bld
->undef
)
462 const char *intrinsic
= NULL
;
464 if (!type
.sign
&& (a
== bld
->one
|| b
== bld
->one
))
467 if (!type
.floating
&& !type
.fixed
) {
468 if (LLVM_VERSION_MAJOR
>= 9) {
470 intrinsic
= type
.sign
? "llvm.sadd.sat" : "llvm.uadd.sat";
471 lp_format_intrinsic(intrin
, sizeof intrin
, intrinsic
, bld
->vec_type
);
472 return lp_build_intrinsic_binary(builder
, intrin
, bld
->vec_type
, a
, b
);
474 if (type
.width
* type
.length
== 128) {
475 if (util_cpu_caps
.has_sse2
) {
477 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" :
478 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.sse2.paddus.b" : NULL
;
479 if (type
.width
== 16)
480 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" :
481 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.sse2.paddus.w" : NULL
;
482 } else if (util_cpu_caps
.has_altivec
) {
484 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
485 if (type
.width
== 16)
486 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
489 if (type
.width
* type
.length
== 256) {
490 if (util_cpu_caps
.has_avx2
) {
492 intrinsic
= type
.sign
? "llvm.x86.avx2.padds.b" :
493 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.avx2.paddus.b" : NULL
;
494 if (type
.width
== 16)
495 intrinsic
= type
.sign
? "llvm.x86.avx2.padds.w" :
496 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.avx2.paddus.w" : NULL
;
502 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
505 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
507 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
508 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
509 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
510 /* a_clamp_max is the maximum a for positive b,
511 a_clamp_min is the minimum a for negative b. */
512 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildSub(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
513 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildSub(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
514 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_max
, a_clamp_min
);
518 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
520 res
= LLVMConstFAdd(a
, b
);
522 res
= LLVMConstAdd(a
, b
);
525 res
= LLVMBuildFAdd(builder
, a
, b
, "");
527 res
= LLVMBuildAdd(builder
, a
, b
, "");
529 /* clamp to ceiling of 1.0 */
530 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
531 res
= lp_build_min_simple(bld
, res
, bld
->one
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
533 if (type
.norm
&& !type
.floating
&& !type
.fixed
) {
536 * newer llvm versions no longer support the intrinsics, but recognize
537 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
538 * code, it is important we match the pattern llvm uses (and pray llvm
539 * doesn't change it - and hope they decide on the same pattern for
540 * all backends supporting it...).
541 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
542 * interfere with llvm's ability to recognize the pattern but seems
544 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
546 LLVMValueRef overflowed
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, res
);
547 res
= lp_build_select(bld
, overflowed
,
548 LLVMConstAllOnes(bld
->int_vec_type
), res
);
552 /* XXX clamp to floor of -1 or 0??? */
558 /** Return the scalar sum of the elements of a.
559 * Should avoid this operation whenever possible.
562 lp_build_horizontal_add(struct lp_build_context
*bld
,
565 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
566 const struct lp_type type
= bld
->type
;
567 LLVMValueRef index
, res
;
569 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
570 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
571 LLVMValueRef vecres
, elem2
;
573 assert(lp_check_value(type
, a
));
575 if (type
.length
== 1) {
579 assert(!bld
->type
.norm
);
582 * for byte vectors can do much better with psadbw.
583 * Using repeated shuffle/adds here. Note with multiple vectors
584 * this can be done more efficiently as outlined in the intel
585 * optimization manual.
586 * Note: could cause data rearrangement if used with smaller element
591 length
= type
.length
/ 2;
593 LLVMValueRef vec1
, vec2
;
594 for (i
= 0; i
< length
; i
++) {
595 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
596 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
598 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
599 LLVMConstVector(shuffles1
, length
), "");
600 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
601 LLVMConstVector(shuffles2
, length
), "");
603 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
606 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
608 length
= length
>> 1;
611 /* always have vector of size 2 here */
614 index
= lp_build_const_int32(bld
->gallivm
, 0);
615 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
616 index
= lp_build_const_int32(bld
->gallivm
, 1);
617 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
620 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
622 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
628 * Return the horizontal sums of 4 float vectors as a float4 vector.
629 * This uses the technique as outlined in Intel Optimization Manual.
632 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
635 struct gallivm_state
*gallivm
= bld
->gallivm
;
636 LLVMBuilderRef builder
= gallivm
->builder
;
637 LLVMValueRef shuffles
[4];
639 LLVMValueRef sumtmp
[2], shuftmp
[2];
641 /* lower half of regs */
642 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
643 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
644 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
645 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
646 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
647 LLVMConstVector(shuffles
, 4), "");
648 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
649 LLVMConstVector(shuffles
, 4), "");
651 /* upper half of regs */
652 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
653 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
654 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
655 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
656 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
657 LLVMConstVector(shuffles
, 4), "");
658 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
659 LLVMConstVector(shuffles
, 4), "");
661 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
662 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
664 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
665 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
666 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
667 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
668 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
669 LLVMConstVector(shuffles
, 4), "");
671 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
672 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
673 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
674 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
675 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
676 LLVMConstVector(shuffles
, 4), "");
678 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
683 * partially horizontally add 2-4 float vectors with length nx4,
684 * i.e. only four adjacent values in each vector will be added,
685 * assuming values are really grouped in 4 which also determines
688 * Return a vector of the same length as the initial vectors,
689 * with the excess elements (if any) being undefined.
690 * The element order is independent of number of input vectors.
691 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
692 * the output order thus will be
693 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
696 lp_build_hadd_partial4(struct lp_build_context
*bld
,
697 LLVMValueRef vectors
[],
700 struct gallivm_state
*gallivm
= bld
->gallivm
;
701 LLVMBuilderRef builder
= gallivm
->builder
;
702 LLVMValueRef ret_vec
;
704 const char *intrinsic
= NULL
;
706 assert(num_vecs
>= 2 && num_vecs
<= 4);
707 assert(bld
->type
.floating
);
709 /* only use this with at least 2 vectors, as it is sort of expensive
710 * (depending on cpu) and we always need two horizontal adds anyway,
711 * so a shuffle/add approach might be better.
717 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
718 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
720 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
721 bld
->type
.length
== 4) {
722 intrinsic
= "llvm.x86.sse3.hadd.ps";
724 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
725 bld
->type
.length
== 8) {
726 intrinsic
= "llvm.x86.avx.hadd.ps.256";
729 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
730 lp_build_vec_type(gallivm
, bld
->type
),
733 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
734 lp_build_vec_type(gallivm
, bld
->type
),
740 return lp_build_intrinsic_binary(builder
, intrinsic
,
741 lp_build_vec_type(gallivm
, bld
->type
),
745 if (bld
->type
.length
== 4) {
746 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
749 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
751 unsigned num_iter
= bld
->type
.length
/ 4;
752 struct lp_type parttype
= bld
->type
;
754 for (j
= 0; j
< num_iter
; j
++) {
755 LLVMValueRef partsrc
[4];
757 for (i
= 0; i
< 4; i
++) {
758 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
760 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
762 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
771 lp_build_sub(struct lp_build_context
*bld
,
775 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
776 const struct lp_type type
= bld
->type
;
779 assert(lp_check_value(type
, a
));
780 assert(lp_check_value(type
, b
));
784 if (a
== bld
->undef
|| b
== bld
->undef
)
790 const char *intrinsic
= NULL
;
792 if (!type
.sign
&& b
== bld
->one
)
795 if (!type
.floating
&& !type
.fixed
) {
796 if (LLVM_VERSION_MAJOR
>= 9) {
798 intrinsic
= type
.sign
? "llvm.ssub.sat" : "llvm.usub.sat";
799 lp_format_intrinsic(intrin
, sizeof intrin
, intrinsic
, bld
->vec_type
);
800 return lp_build_intrinsic_binary(builder
, intrin
, bld
->vec_type
, a
, b
);
802 if (type
.width
* type
.length
== 128) {
803 if (util_cpu_caps
.has_sse2
) {
805 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" :
806 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.sse2.psubus.b" : NULL
;
807 if (type
.width
== 16)
808 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" :
809 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.sse2.psubus.w" : NULL
;
810 } else if (util_cpu_caps
.has_altivec
) {
812 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
813 if (type
.width
== 16)
814 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
817 if (type
.width
* type
.length
== 256) {
818 if (util_cpu_caps
.has_avx2
) {
820 intrinsic
= type
.sign
? "llvm.x86.avx2.psubs.b" :
821 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.avx2.psubus.b" : NULL
;
822 if (type
.width
== 16)
823 intrinsic
= type
.sign
? "llvm.x86.avx2.psubs.w" :
824 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.avx2.psubus.w" : NULL
;
830 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
833 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
835 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
836 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
837 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
838 /* a_clamp_max is the maximum a for negative b,
839 a_clamp_min is the minimum a for positive b. */
840 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildAdd(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
841 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildAdd(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
842 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_min
, a_clamp_max
);
845 * This must match llvm pattern for saturated unsigned sub.
846 * (lp_build_max_simple actually does the job with its current
847 * definition but do it explicitly here.)
848 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
849 * interfere with llvm's ability to recognize the pattern but seems
851 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
853 LLVMValueRef no_ov
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
854 a
= lp_build_select(bld
, no_ov
, a
, b
);
858 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
860 res
= LLVMConstFSub(a
, b
);
862 res
= LLVMConstSub(a
, b
);
865 res
= LLVMBuildFSub(builder
, a
, b
, "");
867 res
= LLVMBuildSub(builder
, a
, b
, "");
869 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
870 res
= lp_build_max_simple(bld
, res
, bld
->zero
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
878 * Normalized multiplication.
880 * There are several approaches for (using 8-bit normalized multiplication as
885 * makes the following approximation to the division (Sree)
887 * a*b/255 ~= (a*(b + 1)) >> 256
889 * which is the fastest method that satisfies the following OpenGL criteria of
891 * 0*0 = 0 and 255*255 = 255
895 * takes the geometric series approximation to the division
897 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
899 * in this case just the first two terms to fit in 16bit arithmetic
901 * t/255 ~= (t + (t >> 8)) >> 8
903 * note that just by itself it doesn't satisfies the OpenGL criteria, as
904 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
907 * - geometric series plus rounding
909 * when using a geometric series division instead of truncating the result
910 * use roundoff in the approximation (Jim Blinn)
912 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
914 * achieving the exact results.
918 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
919 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
920 * @sa Michael Herf, The "double blend trick", May 2000,
921 * http://www.stereopsis.com/doubleblend.html
924 lp_build_mul_norm(struct gallivm_state
*gallivm
,
925 struct lp_type wide_type
,
926 LLVMValueRef a
, LLVMValueRef b
)
928 LLVMBuilderRef builder
= gallivm
->builder
;
929 struct lp_build_context bld
;
934 assert(!wide_type
.floating
);
935 assert(lp_check_value(wide_type
, a
));
936 assert(lp_check_value(wide_type
, b
));
938 lp_build_context_init(&bld
, gallivm
, wide_type
);
940 n
= wide_type
.width
/ 2;
941 if (wide_type
.sign
) {
946 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
947 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
951 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
954 ab
= LLVMBuildMul(builder
, a
, b
, "");
955 ab
= LLVMBuildAdd(builder
, ab
, lp_build_shr_imm(&bld
, ab
, n
), "");
958 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
961 half
= lp_build_const_int_vec(gallivm
, wide_type
, 1LL << (n
- 1));
962 if (wide_type
.sign
) {
963 LLVMValueRef minus_half
= LLVMBuildNeg(builder
, half
, "");
964 LLVMValueRef sign
= lp_build_shr_imm(&bld
, ab
, wide_type
.width
- 1);
965 half
= lp_build_select(&bld
, sign
, minus_half
, half
);
967 ab
= LLVMBuildAdd(builder
, ab
, half
, "");
970 ab
= lp_build_shr_imm(&bld
, ab
, n
);
979 lp_build_mul(struct lp_build_context
*bld
,
983 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
984 const struct lp_type type
= bld
->type
;
988 assert(lp_check_value(type
, a
));
989 assert(lp_check_value(type
, b
));
999 if(a
== bld
->undef
|| b
== bld
->undef
)
1002 if (!type
.floating
&& !type
.fixed
&& type
.norm
) {
1003 struct lp_type wide_type
= lp_wider_type(type
);
1004 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
1006 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, a
, &al
, &ah
);
1007 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, b
, &bl
, &bh
);
1009 /* PMULLW, PSRLW, PADDW */
1010 abl
= lp_build_mul_norm(bld
->gallivm
, wide_type
, al
, bl
);
1011 abh
= lp_build_mul_norm(bld
->gallivm
, wide_type
, ah
, bh
);
1013 ab
= lp_build_pack2_native(bld
->gallivm
, wide_type
, type
, abl
, abh
);
1019 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
1023 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1025 res
= LLVMConstFMul(a
, b
);
1027 res
= LLVMConstMul(a
, b
);
1030 res
= LLVMConstAShr(res
, shift
);
1032 res
= LLVMConstLShr(res
, shift
);
1037 res
= LLVMBuildFMul(builder
, a
, b
, "");
1039 res
= LLVMBuildMul(builder
, a
, b
, "");
1042 res
= LLVMBuildAShr(builder
, res
, shift
, "");
1044 res
= LLVMBuildLShr(builder
, res
, shift
, "");
1052 * Widening mul, valid for 32x32 bit -> 64bit only.
1053 * Result is low 32bits, high bits returned in res_hi.
1055 * Emits code that is meant to be compiled for the host CPU.
1058 lp_build_mul_32_lohi_cpu(struct lp_build_context
*bld
,
1061 LLVMValueRef
*res_hi
)
1063 struct gallivm_state
*gallivm
= bld
->gallivm
;
1064 LLVMBuilderRef builder
= gallivm
->builder
;
1066 assert(bld
->type
.width
== 32);
1067 assert(bld
->type
.floating
== 0);
1068 assert(bld
->type
.fixed
== 0);
1069 assert(bld
->type
.norm
== 0);
1072 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1073 * for x86 simd is atrocious (even if the high bits weren't required),
1074 * trying to handle real 64bit inputs (which of course can't happen due
1075 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1076 * apparently llvm does not recognize this widening mul). This includes 6
1077 * (instead of 2) pmuludq plus extra adds and shifts
1078 * The same story applies to signed mul, albeit fixing this requires sse41.
1079 * https://llvm.org/bugs/show_bug.cgi?id=30845
1080 * So, whip up our own code, albeit only for length 4 and 8 (which
1081 * should be good enough)...
1082 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1083 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1084 * for signed), which the fallback code does not, without this llvm
1085 * will likely still produce atrocious code.
1087 if (LLVM_VERSION_MAJOR
< 7 &&
1088 (bld
->type
.length
== 4 || bld
->type
.length
== 8) &&
1089 ((util_cpu_caps
.has_sse2
&& (bld
->type
.sign
== 0)) ||
1090 util_cpu_caps
.has_sse4_1
)) {
1091 const char *intrinsic
= NULL
;
1092 LLVMValueRef aeven
, aodd
, beven
, bodd
, muleven
, mulodd
;
1093 LLVMValueRef shuf
[LP_MAX_VECTOR_WIDTH
/ 32], shuf_vec
;
1094 struct lp_type type_wide
= lp_wider_type(bld
->type
);
1095 LLVMTypeRef wider_type
= lp_build_vec_type(gallivm
, type_wide
);
1097 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1098 shuf
[i
] = lp_build_const_int32(gallivm
, i
+1);
1099 shuf
[i
+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm
->context
));
1101 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1104 aodd
= LLVMBuildShuffleVector(builder
, aeven
, bld
->undef
, shuf_vec
, "");
1105 bodd
= LLVMBuildShuffleVector(builder
, beven
, bld
->undef
, shuf_vec
, "");
1107 if (util_cpu_caps
.has_avx2
&& bld
->type
.length
== 8) {
1108 if (bld
->type
.sign
) {
1109 intrinsic
= "llvm.x86.avx2.pmul.dq";
1111 intrinsic
= "llvm.x86.avx2.pmulu.dq";
1113 muleven
= lp_build_intrinsic_binary(builder
, intrinsic
,
1114 wider_type
, aeven
, beven
);
1115 mulodd
= lp_build_intrinsic_binary(builder
, intrinsic
,
1116 wider_type
, aodd
, bodd
);
1119 /* for consistent naming look elsewhere... */
1120 if (bld
->type
.sign
) {
1121 intrinsic
= "llvm.x86.sse41.pmuldq";
1123 intrinsic
= "llvm.x86.sse2.pmulu.dq";
1126 * XXX If we only have AVX but not AVX2 this is a pain.
1127 * lp_build_intrinsic_binary_anylength() can't handle it
1128 * (due to src and dst type not being identical).
1130 if (bld
->type
.length
== 8) {
1131 LLVMValueRef aevenlo
, aevenhi
, bevenlo
, bevenhi
;
1132 LLVMValueRef aoddlo
, aoddhi
, boddlo
, boddhi
;
1133 LLVMValueRef muleven2
[2], mulodd2
[2];
1134 struct lp_type type_wide_half
= type_wide
;
1135 LLVMTypeRef wtype_half
;
1136 type_wide_half
.length
= 2;
1137 wtype_half
= lp_build_vec_type(gallivm
, type_wide_half
);
1138 aevenlo
= lp_build_extract_range(gallivm
, aeven
, 0, 4);
1139 aevenhi
= lp_build_extract_range(gallivm
, aeven
, 4, 4);
1140 bevenlo
= lp_build_extract_range(gallivm
, beven
, 0, 4);
1141 bevenhi
= lp_build_extract_range(gallivm
, beven
, 4, 4);
1142 aoddlo
= lp_build_extract_range(gallivm
, aodd
, 0, 4);
1143 aoddhi
= lp_build_extract_range(gallivm
, aodd
, 4, 4);
1144 boddlo
= lp_build_extract_range(gallivm
, bodd
, 0, 4);
1145 boddhi
= lp_build_extract_range(gallivm
, bodd
, 4, 4);
1146 muleven2
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
1147 wtype_half
, aevenlo
, bevenlo
);
1148 mulodd2
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
1149 wtype_half
, aoddlo
, boddlo
);
1150 muleven2
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
1151 wtype_half
, aevenhi
, bevenhi
);
1152 mulodd2
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
1153 wtype_half
, aoddhi
, boddhi
);
1154 muleven
= lp_build_concat(gallivm
, muleven2
, type_wide_half
, 2);
1155 mulodd
= lp_build_concat(gallivm
, mulodd2
, type_wide_half
, 2);
1159 muleven
= lp_build_intrinsic_binary(builder
, intrinsic
,
1160 wider_type
, aeven
, beven
);
1161 mulodd
= lp_build_intrinsic_binary(builder
, intrinsic
,
1162 wider_type
, aodd
, bodd
);
1165 muleven
= LLVMBuildBitCast(builder
, muleven
, bld
->vec_type
, "");
1166 mulodd
= LLVMBuildBitCast(builder
, mulodd
, bld
->vec_type
, "");
1168 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1169 shuf
[i
] = lp_build_const_int32(gallivm
, i
+ 1);
1170 shuf
[i
+1] = lp_build_const_int32(gallivm
, i
+ 1 + bld
->type
.length
);
1172 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1173 *res_hi
= LLVMBuildShuffleVector(builder
, muleven
, mulodd
, shuf_vec
, "");
1175 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1176 shuf
[i
] = lp_build_const_int32(gallivm
, i
);
1177 shuf
[i
+1] = lp_build_const_int32(gallivm
, i
+ bld
->type
.length
);
1179 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1180 return LLVMBuildShuffleVector(builder
, muleven
, mulodd
, shuf_vec
, "");
1183 return lp_build_mul_32_lohi(bld
, a
, b
, res_hi
);
1189 * Widening mul, valid for 32x32 bit -> 64bit only.
1190 * Result is low 32bits, high bits returned in res_hi.
1192 * Emits generic code.
1195 lp_build_mul_32_lohi(struct lp_build_context
*bld
,
1198 LLVMValueRef
*res_hi
)
1200 struct gallivm_state
*gallivm
= bld
->gallivm
;
1201 LLVMBuilderRef builder
= gallivm
->builder
;
1202 LLVMValueRef tmp
, shift
, res_lo
;
1203 struct lp_type type_tmp
;
1204 LLVMTypeRef wide_type
, narrow_type
;
1206 type_tmp
= bld
->type
;
1207 narrow_type
= lp_build_vec_type(gallivm
, type_tmp
);
1208 type_tmp
.width
*= 2;
1209 wide_type
= lp_build_vec_type(gallivm
, type_tmp
);
1210 shift
= lp_build_const_vec(gallivm
, type_tmp
, 32);
1212 if (bld
->type
.sign
) {
1213 a
= LLVMBuildSExt(builder
, a
, wide_type
, "");
1214 b
= LLVMBuildSExt(builder
, b
, wide_type
, "");
1216 a
= LLVMBuildZExt(builder
, a
, wide_type
, "");
1217 b
= LLVMBuildZExt(builder
, b
, wide_type
, "");
1219 tmp
= LLVMBuildMul(builder
, a
, b
, "");
1221 res_lo
= LLVMBuildTrunc(builder
, tmp
, narrow_type
, "");
1223 /* Since we truncate anyway, LShr and AShr are equivalent. */
1224 tmp
= LLVMBuildLShr(builder
, tmp
, shift
, "");
1225 *res_hi
= LLVMBuildTrunc(builder
, tmp
, narrow_type
, "");
1233 lp_build_mad(struct lp_build_context
*bld
,
1238 const struct lp_type type
= bld
->type
;
1239 if (type
.floating
) {
1240 return lp_build_fmuladd(bld
->gallivm
->builder
, a
, b
, c
);
1242 return lp_build_add(bld
, lp_build_mul(bld
, a
, b
), c
);
1248 * Small vector x scale multiplication optimization.
1251 lp_build_mul_imm(struct lp_build_context
*bld
,
1255 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1256 LLVMValueRef factor
;
1258 assert(lp_check_value(bld
->type
, a
));
1267 return lp_build_negate(bld
, a
);
1269 if(b
== 2 && bld
->type
.floating
)
1270 return lp_build_add(bld
, a
, a
);
1272 if(util_is_power_of_two_or_zero(b
)) {
1273 unsigned shift
= ffs(b
) - 1;
1275 if(bld
->type
.floating
) {
1278 * Power of two multiplication by directly manipulating the exponent.
1280 * XXX: This might not be always faster, it will introduce a small error
1281 * for multiplication by zero, and it will produce wrong results
1284 unsigned mantissa
= lp_mantissa(bld
->type
);
1285 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
1286 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
1287 a
= LLVMBuildAdd(builder
, a
, factor
, "");
1288 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
1293 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
1294 return LLVMBuildShl(builder
, a
, factor
, "");
1298 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
1299 return lp_build_mul(bld
, a
, factor
);
1307 lp_build_div(struct lp_build_context
*bld
,
1311 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1312 const struct lp_type type
= bld
->type
;
1314 assert(lp_check_value(type
, a
));
1315 assert(lp_check_value(type
, b
));
1319 if(a
== bld
->one
&& type
.floating
)
1320 return lp_build_rcp(bld
, b
);
1325 if(a
== bld
->undef
|| b
== bld
->undef
)
1328 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1330 return LLVMConstFDiv(a
, b
);
1332 return LLVMConstSDiv(a
, b
);
1334 return LLVMConstUDiv(a
, b
);
1337 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1339 ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
1340 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
1342 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
1345 return LLVMBuildFDiv(builder
, a
, b
, "");
1347 return LLVMBuildSDiv(builder
, a
, b
, "");
1349 return LLVMBuildUDiv(builder
, a
, b
, "");
1354 * Linear interpolation helper.
1356 * @param normalized whether we are interpolating normalized values,
1357 * encoded in normalized integers, twice as wide.
1359 * @sa http://www.stereopsis.com/doubleblend.html
1361 static inline LLVMValueRef
1362 lp_build_lerp_simple(struct lp_build_context
*bld
,
1368 unsigned half_width
= bld
->type
.width
/2;
1369 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1373 assert(lp_check_value(bld
->type
, x
));
1374 assert(lp_check_value(bld
->type
, v0
));
1375 assert(lp_check_value(bld
->type
, v1
));
1377 delta
= lp_build_sub(bld
, v1
, v0
);
1379 if (bld
->type
.floating
) {
1381 return lp_build_mad(bld
, x
, delta
, v0
);
1384 if (flags
& LP_BLD_LERP_WIDE_NORMALIZED
) {
1385 if (!bld
->type
.sign
) {
1386 if (!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
)) {
1388 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1389 * most-significant-bit to the lowest-significant-bit, so that
1390 * later we can just divide by 2**n instead of 2**n - 1.
1393 x
= lp_build_add(bld
, x
, lp_build_shr_imm(bld
, x
, half_width
- 1));
1396 /* (x * delta) >> n */
1397 res
= lp_build_mul(bld
, x
, delta
);
1398 res
= lp_build_shr_imm(bld
, res
, half_width
);
1401 * The rescaling trick above doesn't work for signed numbers, so
1402 * use the 2**n - 1 divison approximation in lp_build_mul_norm
1405 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1406 res
= lp_build_mul_norm(bld
->gallivm
, bld
->type
, x
, delta
);
1409 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1410 res
= lp_build_mul(bld
, x
, delta
);
1413 if ((flags
& LP_BLD_LERP_WIDE_NORMALIZED
) && !bld
->type
.sign
) {
1415 * At this point both res and v0 only use the lower half of the bits,
1416 * the rest is zero. Instead of add / mask, do add with half wide type.
1418 struct lp_type narrow_type
;
1419 struct lp_build_context narrow_bld
;
1421 memset(&narrow_type
, 0, sizeof narrow_type
);
1422 narrow_type
.sign
= bld
->type
.sign
;
1423 narrow_type
.width
= bld
->type
.width
/2;
1424 narrow_type
.length
= bld
->type
.length
*2;
1426 lp_build_context_init(&narrow_bld
, bld
->gallivm
, narrow_type
);
1427 res
= LLVMBuildBitCast(builder
, res
, narrow_bld
.vec_type
, "");
1428 v0
= LLVMBuildBitCast(builder
, v0
, narrow_bld
.vec_type
, "");
1429 res
= lp_build_add(&narrow_bld
, v0
, res
);
1430 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
1432 res
= lp_build_add(bld
, v0
, res
);
1434 if (bld
->type
.fixed
) {
1436 * We need to mask out the high order bits when lerping 8bit
1437 * normalized colors stored on 16bits
1439 /* XXX: This step is necessary for lerping 8bit colors stored on
1440 * 16bits, but it will be wrong for true fixed point use cases.
1441 * Basically we need a more powerful lp_type, capable of further
1442 * distinguishing the values interpretation from the value storage.
1444 LLVMValueRef low_bits
;
1445 low_bits
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << half_width
) - 1);
1446 res
= LLVMBuildAnd(builder
, res
, low_bits
, "");
1455 * Linear interpolation.
1458 lp_build_lerp(struct lp_build_context
*bld
,
1464 const struct lp_type type
= bld
->type
;
1467 assert(lp_check_value(type
, x
));
1468 assert(lp_check_value(type
, v0
));
1469 assert(lp_check_value(type
, v1
));
1471 assert(!(flags
& LP_BLD_LERP_WIDE_NORMALIZED
));
1474 struct lp_type wide_type
;
1475 struct lp_build_context wide_bld
;
1476 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
1478 assert(type
.length
>= 2);
1481 * Create a wider integer type, enough to hold the
1482 * intermediate result of the multiplication.
1484 memset(&wide_type
, 0, sizeof wide_type
);
1485 wide_type
.sign
= type
.sign
;
1486 wide_type
.width
= type
.width
*2;
1487 wide_type
.length
= type
.length
/2;
1489 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
1491 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
1492 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
1493 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
1499 flags
|= LP_BLD_LERP_WIDE_NORMALIZED
;
1501 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
, flags
);
1502 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
, flags
);
1504 res
= lp_build_pack2_native(bld
->gallivm
, wide_type
, type
, resl
, resh
);
1506 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
, flags
);
1514 * Bilinear interpolation.
1516 * Values indices are in v_{yx}.
1519 lp_build_lerp_2d(struct lp_build_context
*bld
,
1528 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
, flags
);
1529 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
, flags
);
1530 return lp_build_lerp(bld
, y
, v0
, v1
, flags
);
1535 lp_build_lerp_3d(struct lp_build_context
*bld
,
1549 LLVMValueRef v0
= lp_build_lerp_2d(bld
, x
, y
, v000
, v001
, v010
, v011
, flags
);
1550 LLVMValueRef v1
= lp_build_lerp_2d(bld
, x
, y
, v100
, v101
, v110
, v111
, flags
);
1551 return lp_build_lerp(bld
, z
, v0
, v1
, flags
);
1556 * Generate min(a, b)
1557 * Do checks for special cases but not for nans.
1560 lp_build_min(struct lp_build_context
*bld
,
1564 assert(lp_check_value(bld
->type
, a
));
1565 assert(lp_check_value(bld
->type
, b
));
1567 if(a
== bld
->undef
|| b
== bld
->undef
)
1573 if (bld
->type
.norm
) {
1574 if (!bld
->type
.sign
) {
1575 if (a
== bld
->zero
|| b
== bld
->zero
) {
1585 return lp_build_min_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1590 * Generate min(a, b)
1591 * NaN's are handled according to the behavior specified by the
1592 * nan_behavior argument.
1595 lp_build_min_ext(struct lp_build_context
*bld
,
1598 enum gallivm_nan_behavior nan_behavior
)
1600 assert(lp_check_value(bld
->type
, a
));
1601 assert(lp_check_value(bld
->type
, b
));
1603 if(a
== bld
->undef
|| b
== bld
->undef
)
1609 if (bld
->type
.norm
) {
1610 if (!bld
->type
.sign
) {
1611 if (a
== bld
->zero
|| b
== bld
->zero
) {
1621 return lp_build_min_simple(bld
, a
, b
, nan_behavior
);
1625 * Generate max(a, b)
1626 * Do checks for special cases, but NaN behavior is undefined.
1629 lp_build_max(struct lp_build_context
*bld
,
1633 assert(lp_check_value(bld
->type
, a
));
1634 assert(lp_check_value(bld
->type
, b
));
1636 if(a
== bld
->undef
|| b
== bld
->undef
)
1642 if(bld
->type
.norm
) {
1643 if(a
== bld
->one
|| b
== bld
->one
)
1645 if (!bld
->type
.sign
) {
1646 if (a
== bld
->zero
) {
1649 if (b
== bld
->zero
) {
1655 return lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1660 * Generate max(a, b)
1661 * Checks for special cases.
1662 * NaN's are handled according to the behavior specified by the
1663 * nan_behavior argument.
1666 lp_build_max_ext(struct lp_build_context
*bld
,
1669 enum gallivm_nan_behavior nan_behavior
)
1671 assert(lp_check_value(bld
->type
, a
));
1672 assert(lp_check_value(bld
->type
, b
));
1674 if(a
== bld
->undef
|| b
== bld
->undef
)
1680 if(bld
->type
.norm
) {
1681 if(a
== bld
->one
|| b
== bld
->one
)
1683 if (!bld
->type
.sign
) {
1684 if (a
== bld
->zero
) {
1687 if (b
== bld
->zero
) {
1693 return lp_build_max_simple(bld
, a
, b
, nan_behavior
);
1697 * Generate clamp(a, min, max)
1698 * NaN behavior (for any of a, min, max) is undefined.
1699 * Do checks for special cases.
1702 lp_build_clamp(struct lp_build_context
*bld
,
1707 assert(lp_check_value(bld
->type
, a
));
1708 assert(lp_check_value(bld
->type
, min
));
1709 assert(lp_check_value(bld
->type
, max
));
1711 a
= lp_build_min(bld
, a
, max
);
1712 a
= lp_build_max(bld
, a
, min
);
1718 * Generate clamp(a, 0, 1)
1719 * A NaN will get converted to zero.
1722 lp_build_clamp_zero_one_nanzero(struct lp_build_context
*bld
,
1725 a
= lp_build_max_ext(bld
, a
, bld
->zero
, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
1726 a
= lp_build_min(bld
, a
, bld
->one
);
1735 lp_build_abs(struct lp_build_context
*bld
,
1738 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1739 const struct lp_type type
= bld
->type
;
1740 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1742 assert(lp_check_value(type
, a
));
1749 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fabs", vec_type
);
1750 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1753 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
&& LLVM_VERSION_MAJOR
< 6) {
1754 switch(type
.width
) {
1756 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1758 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1760 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1763 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_avx2
&& LLVM_VERSION_MAJOR
< 6) {
1764 switch(type
.width
) {
1766 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.b", vec_type
, a
);
1768 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.w", vec_type
, a
);
1770 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.d", vec_type
, a
);
1774 return lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
),
1775 a
, LLVMBuildNeg(builder
, a
, ""));
1780 lp_build_negate(struct lp_build_context
*bld
,
1783 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1785 assert(lp_check_value(bld
->type
, a
));
1787 if (bld
->type
.floating
)
1788 a
= LLVMBuildFNeg(builder
, a
, "");
1790 a
= LLVMBuildNeg(builder
, a
, "");
1796 /** Return -1, 0 or +1 depending on the sign of a */
1798 lp_build_sgn(struct lp_build_context
*bld
,
1801 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1802 const struct lp_type type
= bld
->type
;
1806 assert(lp_check_value(type
, a
));
1808 /* Handle non-zero case */
1810 /* if not zero then sign must be positive */
1813 else if(type
.floating
) {
1814 LLVMTypeRef vec_type
;
1815 LLVMTypeRef int_type
;
1819 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1821 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1822 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1823 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1825 /* Take the sign bit and add it to 1 constant */
1826 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1827 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1828 one
= LLVMConstBitCast(bld
->one
, int_type
);
1829 res
= LLVMBuildOr(builder
, sign
, one
, "");
1830 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1834 /* signed int/norm/fixed point */
1835 /* could use psign with sse3 and appropriate vectors here */
1836 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1837 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1838 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1842 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1843 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1850 * Set the sign of float vector 'a' according to 'sign'.
1851 * If sign==0, return abs(a).
1852 * If sign==1, return -abs(a);
1853 * Other values for sign produce undefined results.
1856 lp_build_set_sign(struct lp_build_context
*bld
,
1857 LLVMValueRef a
, LLVMValueRef sign
)
1859 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1860 const struct lp_type type
= bld
->type
;
1861 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1862 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1863 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1864 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1865 ~((unsigned long long) 1 << (type
.width
- 1)));
1866 LLVMValueRef val
, res
;
1868 assert(type
.floating
);
1869 assert(lp_check_value(type
, a
));
1871 /* val = reinterpret_cast<int>(a) */
1872 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1873 /* val = val & mask */
1874 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1875 /* sign = sign << shift */
1876 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1877 /* res = val | sign */
1878 res
= LLVMBuildOr(builder
, val
, sign
, "");
1879 /* res = reinterpret_cast<float>(res) */
1880 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1887 * Convert vector of (or scalar) int to vector of (or scalar) float.
1890 lp_build_int_to_float(struct lp_build_context
*bld
,
1893 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1894 const struct lp_type type
= bld
->type
;
1895 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1897 assert(type
.floating
);
1899 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1903 arch_rounding_available(const struct lp_type type
)
1905 if ((util_cpu_caps
.has_sse4_1
&&
1906 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1907 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256) ||
1908 (util_cpu_caps
.has_avx512f
&& type
.width
*type
.length
== 512))
1910 else if ((util_cpu_caps
.has_altivec
&&
1911 (type
.width
== 32 && type
.length
== 4)))
1913 else if (util_cpu_caps
.has_neon
)
1919 enum lp_build_round_mode
1921 LP_BUILD_ROUND_NEAREST
= 0,
1922 LP_BUILD_ROUND_FLOOR
= 1,
1923 LP_BUILD_ROUND_CEIL
= 2,
1924 LP_BUILD_ROUND_TRUNCATE
= 3
1927 static inline LLVMValueRef
1928 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1931 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1932 const struct lp_type type
= bld
->type
;
1933 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1934 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1935 const char *intrinsic
;
1938 assert(type
.floating
);
1939 /* using the double precision conversions is a bit more complicated */
1940 assert(type
.width
== 32);
1942 assert(lp_check_value(type
, a
));
1943 assert(util_cpu_caps
.has_sse2
);
1945 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1946 if (type
.length
== 1) {
1947 LLVMTypeRef vec_type
;
1950 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1952 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1954 intrinsic
= "llvm.x86.sse.cvtss2si";
1956 undef
= LLVMGetUndef(vec_type
);
1958 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1960 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1964 if (type
.width
* type
.length
== 128) {
1965 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1968 assert(type
.width
*type
.length
== 256);
1969 assert(util_cpu_caps
.has_avx
);
1971 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
1973 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1983 static inline LLVMValueRef
1984 lp_build_round_altivec(struct lp_build_context
*bld
,
1986 enum lp_build_round_mode mode
)
1988 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1989 const struct lp_type type
= bld
->type
;
1990 const char *intrinsic
= NULL
;
1992 assert(type
.floating
);
1994 assert(lp_check_value(type
, a
));
1995 assert(util_cpu_caps
.has_altivec
);
2000 case LP_BUILD_ROUND_NEAREST
:
2001 intrinsic
= "llvm.ppc.altivec.vrfin";
2003 case LP_BUILD_ROUND_FLOOR
:
2004 intrinsic
= "llvm.ppc.altivec.vrfim";
2006 case LP_BUILD_ROUND_CEIL
:
2007 intrinsic
= "llvm.ppc.altivec.vrfip";
2009 case LP_BUILD_ROUND_TRUNCATE
:
2010 intrinsic
= "llvm.ppc.altivec.vrfiz";
2014 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2017 static inline LLVMValueRef
2018 lp_build_round_arch(struct lp_build_context
*bld
,
2020 enum lp_build_round_mode mode
)
2022 if (util_cpu_caps
.has_sse4_1
|| util_cpu_caps
.has_neon
) {
2023 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2024 const struct lp_type type
= bld
->type
;
2025 const char *intrinsic_root
;
2028 assert(type
.floating
);
2029 assert(lp_check_value(type
, a
));
2033 case LP_BUILD_ROUND_NEAREST
:
2034 intrinsic_root
= "llvm.nearbyint";
2036 case LP_BUILD_ROUND_FLOOR
:
2037 intrinsic_root
= "llvm.floor";
2039 case LP_BUILD_ROUND_CEIL
:
2040 intrinsic_root
= "llvm.ceil";
2042 case LP_BUILD_ROUND_TRUNCATE
:
2043 intrinsic_root
= "llvm.trunc";
2047 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, intrinsic_root
, bld
->vec_type
);
2048 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2050 else /* (util_cpu_caps.has_altivec) */
2051 return lp_build_round_altivec(bld
, a
, mode
);
2055 * Return the integer part of a float (vector) value (== round toward zero).
2056 * The returned value is a float (vector).
2057 * Ex: trunc(-1.5) = -1.0
2060 lp_build_trunc(struct lp_build_context
*bld
,
2063 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2064 const struct lp_type type
= bld
->type
;
2066 assert(type
.floating
);
2067 assert(lp_check_value(type
, a
));
2069 if (arch_rounding_available(type
)) {
2070 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_TRUNCATE
);
2073 const struct lp_type type
= bld
->type
;
2074 struct lp_type inttype
;
2075 struct lp_build_context intbld
;
2076 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
2077 LLVMValueRef trunc
, res
, anosign
, mask
;
2078 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2079 LLVMTypeRef vec_type
= bld
->vec_type
;
2081 assert(type
.width
== 32); /* might want to handle doubles at some point */
2084 inttype
.floating
= 0;
2085 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2087 /* round by truncation */
2088 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2089 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
2091 /* mask out sign bit */
2092 anosign
= lp_build_abs(bld
, a
);
2094 * mask out all values if anosign > 2^24
2095 * This should work both for large ints (all rounding is no-op for them
2096 * because such floats are always exact) as well as special cases like
2097 * NaNs, Infs (taking advantage of the fact they use max exponent).
2098 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2100 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2101 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2102 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2103 return lp_build_select(bld
, mask
, a
, res
);

/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

/**
 * Return floor of float (vector), result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}
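
/*
 * Worked example (illustrative, not from the original source): for
 * a = -1e-9, floor(a) = -1.0 and a - floor(a) = 0.999999999, which rounds
 * up to 1.0 in single precision -- hence the clamping variant below.
 */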

/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}
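
/*
 * For 32-bit floats lp_mantissa() is 23, so the clamp constant above
 * evaluates to 1.0 - 2^-24 = 0.99999994..., the largest float strictly
 * below 1.0.
 */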

/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one. Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}

/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}
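
/*
 * Illustrative note on the fallback path above: half is
 * nextafterf(0.5, 0.0) = 0.49999997 rather than 0.5, since e.g. for
 * a = 0.49999997 the sum a + 0.5 would round up to 1.0 and iround would
 * wrongly yield 1, while with the nudged constant the sum stays below 1.0.
 * OR-ing the sign bit of 'a' into half effectively computes
 * a + copysign(half, a), so negative inputs round away from zero
 * symmetrically before the final truncating conversion.
 */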

/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}
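
/*
 * Worked example for the mask trick above (illustrative): lp_build_cmp
 * yields all-ones (i.e. integer -1) per lane where the condition holds.
 * For a = -1.1, itrunc = -1 and trunc = -1.0 > a, so the lane mask is -1
 * and itrunc + mask = -2 = ifloor(-1.1). For positive or integral inputs
 * the mask is 0 and the truncated value passes through unchanged.
 */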

/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}

/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}

/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}

LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}

/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i + x_i * (1 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef neg_a;
   LLVMValueRef res;

   neg_a = LLVMBuildFNeg(builder, a, "");
   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);

   return res;
}
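
/*
 * Worked example (illustrative): for a = 3 and the estimate x0 = 0.3,
 * x1 = 0.3 + 0.3 * (1 - 3 * 0.3) = 0.33, and one more step gives 0.3333,
 * converging quadratically towards 1/3.
 */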

LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}

/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}
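
/*
 * Worked example (illustrative): for a = 4 and the estimate x0 = 0.6,
 * x1 = 0.5 * 0.6 * (3 - 4 * 0.36) = 0.468 and the next step gives ~0.497,
 * converging towards 1/sqrt(4) = 0.5.
 */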

/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * If there's a fast (inaccurate) rsqrt instruction available
 * (caller may want to avoid to call rsqrt_fast if it's not available,
 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
 * unavailable it would result in sqrt/div/mul so obviously
 * much better to just call sqrt, skipping both div and mul).
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return TRUE;
   }
   return FALSE;
}
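
/*
 * Sketch of the caller pattern described above (illustrative, not code from
 * this file): when the fast instruction is available, sqrt(x) for x > 0 can
 * be had as lp_build_fast_rsqrt(bld, x) * x, since x / sqrt(x) == sqrt(x).
 * Without it that expression would expand to a full sqrt plus div plus mul,
 * so calling lp_build_sqrt() directly is cheaper.
 */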

/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however, scs
 * opcode could also benefit from calculating both though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    *  take the absolute value,
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                              LLVMBuildShl(b, emm2_add,
                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial  (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}
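
/*
 * Worked example of the reduction above (illustrative): for a = 3.0,
 * a * 4/Pi ~= 3.82, so j = 3 and (j + 1) & ~1 = 4. The reduced argument
 * becomes 3.0 - 4 * Pi/4 ~= -0.1416, the quadrant bits select the sine
 * polynomial, and the sign logic flips the result, giving
 * -sin(-0.1416) ~= 0.1411 == sin(3.0).
 */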

/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}

/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}
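
/*
 * Identity used above: pow(x, y) = 2^(y * log2(x)), e.g.
 * pow(2.0, 10.0) = exp2(10 * log2(2)) = exp2(10) = 1024. This is only
 * meaningful for x > 0, since lp_build_log2 is undefined for non-positive
 * inputs.
 */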

LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}

/**
 * Generate log(x).
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}

/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else
      return even;
}
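
/*
 * Worked example of the even/odd split above (illustrative): for
 * c0 + c1*x + c2*x^2 + c3*x^3 the loop builds even = c0 + c2*x2 and
 * odd = c1 + c3*x2 with x2 = x*x, and the final mad computes
 * odd*x + even, so the two Horner chains can issue independently.
 */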

/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};

LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}
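
/*
 * Illustrative note on the bit trick above: for 32-bit floats the biased
 * exponent lives in bits 23..30, so (ipart + 127) << 23 reinterpreted as a
 * float is exactly 2^ipart (e.g. ipart = 3 gives 130 << 23 == 0x41000000
 * == 8.0f). The polynomial then supplies 2^fpart for fpart in [0, 1).
 */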

/**
 * Extract the exponent of a IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
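
/*
 * Worked example (illustrative): x = 8.0f has bit pattern 0x41000000, so
 * (x >> 23) & 255 = 130 and 130 - 127 = 3 = ifloor(log2(8.0)). A bias of
 * -1 would yield 2 instead.
 */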

/**
 * Extract the mantissa of a IEEE-754 floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}
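
/*
 * Worked example (illustrative): x = 12.0f = 1.5 * 2^3. Masking off the
 * exponent bits and OR-ing in the exponent of 1.0 yields 1.5f, i.e. the
 * mantissa normalized into [1, 2).
 */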

/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one));

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is NaN or less than 0, return NaN */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
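
/*
 * Illustrative note on the series used above: ln(m) = 2 * atanh(y) with
 * y = (m - 1) / (m + 1), so log2(m) = y * P(y^2) for a suitable minimax
 * polynomial P. E.g. for m = 1.5, y = 0.2 and y * P(y^2) ~= 0.585 ==
 * log2(1.5), to which the float exponent is then added.
 */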

/**
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/**
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}

/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
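
/*
 * Worked example (illustrative): x = 12.0 gives
 * ipart = floor(log2(12)) - 1 = 2 and fpart = 1.5, so the result is 3.5
 * versus the exact log2(12) ~= 3.585; for powers of two
 * (e.g. x = 8.0: 2 + 1.0 = 3.0) the result is exact.
 */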

/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
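
/*
 * Illustrative note: multiplying by sqrt(2) shifts the exponent-extraction
 * thresholds to the geometric midpoints between powers of two. E.g. x = 5.0
 * (log2 ~= 2.32) becomes 7.07, whose exponent is 2, while x = 6.0
 * (log2 ~= 2.58) becomes 8.49, whose exponent is 3 -- matching
 * iround(log2(x)) in both cases.
 */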

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}

/*
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}
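
/*
 * Note (illustrative): the ordered self-comparison x == x is false exactly
 * for NaN lanes, so negating it and sign-extending the i1 result produces
 * the all-ones / all-zeros integer mask described above.
 */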

/* Returns all 1's for floating point numbers that are
 * finite numbers and returns all zeros for -inf,
 * inf and NaNs */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}
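
/*
 * Illustrative note: 0x7f800000 is the exponent field of a 32-bit float;
 * a value is non-finite (inf or NaN) exactly when all of its exponent bits
 * are set, so comparing the masked bits for inequality with the mask yields
 * the finite-lanes mask.
 */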

/**
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}

LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_cpu_caps.has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_cpu_caps.has_daz) {
         /* Enable denormals are zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;

      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}