gallivm: add LLVMAttribute parameter to lp_build_intrinsic
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86 * No checks for the special-case values a or b = 1 or 0 are done.
87 * NaNs are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if(intrinsic) {
198 /* We need to handle NaNs for floating point numbers. If one of the
199 * inputs is NaN the other should be returned (required by both D3D10+
200 * and OpenCL).
201 * The SSE intrinsics return the second operand in case of NaN by
202 * default, so we need special code to handle those cases.
203 */
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
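
/*
 * Illustrative scalar model (hypothetical helper, not part of the build) of
 * why the NaN fixup above is needed: SSE minps/maxps behave like
 * (a < b) ? a : b and (a > b) ? a : b, so whenever either input is NaN the
 * comparison is false and the second operand 'b' is returned.
 */
#if 0
static float
sse_min_model(float a, float b)
{
   return (a < b) ? a : b;   /* NaN in either input yields b */
}
#endif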
263
264
265 /**
266 * Generate max(a, b)
267 * No checks for the special-case values a or b = 1 or 0 are done.
268 * NaNs are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323 if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if(intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, max;
385 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, max);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, max);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
476
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if(intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 if(type.norm && !type.floating && !type.fixed) {
526 if (type.sign) {
527 uint64_t sign = (uint64_t)1 << (type.width - 1);
528 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
529 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
532 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
534 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
535 } else {
536 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
537 }
538 }
539
540 if(LLVMIsConstant(a) && LLVMIsConstant(b))
541 if (type.floating)
542 res = LLVMConstFAdd(a, b);
543 else
544 res = LLVMConstAdd(a, b);
545 else
546 if (type.floating)
547 res = LLVMBuildFAdd(builder, a, b, "");
548 else
549 res = LLVMBuildAdd(builder, a, b, "");
550
551 /* clamp to ceiling of 1.0 */
552 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
553 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
554
555 /* XXX clamp to floor of -1 or 0??? */
556
557 return res;
558 }
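
/*
 * Illustrative scalar model of the signed saturating-add clamping above
 * (hypothetical helper, not part of the build), shown for 8-bit values held
 * in a wider integer: clamp 'a' so that a + b cannot overflow, then add.
 */
#if 0
static int
add_sat_s8_ref(int a, int b)
{
   if (b > 0 && a > 127 - b)          /* a_clamp_max for positive b */
      a = 127 - b;
   else if (b < 0 && a < -128 - b)    /* a_clamp_min for negative b */
      a = -128 - b;
   return a + b;                      /* result stays within [-128, 127] */
}
#endif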
559
560
561 /** Return the scalar sum of the elements of a.
562 * Should avoid this operation whenever possible.
563 */
564 LLVMValueRef
565 lp_build_horizontal_add(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 LLVMBuilderRef builder = bld->gallivm->builder;
569 const struct lp_type type = bld->type;
570 LLVMValueRef index, res;
571 unsigned i, length;
572 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
573 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
574 LLVMValueRef vecres, elem2;
575
576 assert(lp_check_value(type, a));
577
578 if (type.length == 1) {
579 return a;
580 }
581
582 assert(!bld->type.norm);
583
584 /*
585 * for byte vectors we could do much better with psadbw.
586 * Using repeated shuffle/adds here. Note with multiple vectors
587 * this can be done more efficiently as outlined in the intel
588 * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
590 * sizes.
591 */
592
593 vecres = a;
594 length = type.length / 2;
595 while (length > 1) {
596 LLVMValueRef vec1, vec2;
597 for (i = 0; i < length; i++) {
598 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
599 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
600 }
601 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
602 LLVMConstVector(shuffles1, length), "");
603 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
604 LLVMConstVector(shuffles2, length), "");
605 if (type.floating) {
606 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
607 }
608 else {
609 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
610 }
611 length = length >> 1;
612 }
613
614 /* always have vector of size 2 here */
615 assert(length == 1);
616
617 index = lp_build_const_int32(bld->gallivm, 0);
618 res = LLVMBuildExtractElement(builder, vecres, index, "");
619 index = lp_build_const_int32(bld->gallivm, 1);
620 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
621
622 if (type.floating)
623 res = LLVMBuildFAdd(builder, res, elem2, "");
624 else
625 res = LLVMBuildAdd(builder, res, elem2, "");
626
627 return res;
628 }
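
/*
 * Illustrative scalar model of the reduction above (hypothetical helper, not
 * part of the build): repeatedly fold the upper half onto the lower half
 * until two elements remain, then add those, e.g. 8 -> 4 -> 2 -> scalar.
 */
#if 0
static float
horizontal_add_ref(float v[], unsigned length)   /* length: power of two, >= 2 */
{
   unsigned half, i;
   for (half = length / 2; half > 1; half /= 2) {
      for (i = 0; i < half; i++)
         v[i] += v[i + half];         /* mirrors one shuffle + add step */
   }
   return v[0] + v[1];                /* final extract + scalar add */
}
#endif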
629
630 /**
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632 * This uses the technique as outlined in Intel Optimization Manual.
633 */
634 static LLVMValueRef
635 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
636 LLVMValueRef src[4])
637 {
638 struct gallivm_state *gallivm = bld->gallivm;
639 LLVMBuilderRef builder = gallivm->builder;
640 LLVMValueRef shuffles[4];
641 LLVMValueRef tmp[4];
642 LLVMValueRef sumtmp[2], shuftmp[2];
643
644 /* lower half of regs */
645 shuffles[0] = lp_build_const_int32(gallivm, 0);
646 shuffles[1] = lp_build_const_int32(gallivm, 1);
647 shuffles[2] = lp_build_const_int32(gallivm, 4);
648 shuffles[3] = lp_build_const_int32(gallivm, 5);
649 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
650 LLVMConstVector(shuffles, 4), "");
651 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
652 LLVMConstVector(shuffles, 4), "");
653
654 /* upper half of regs */
655 shuffles[0] = lp_build_const_int32(gallivm, 2);
656 shuffles[1] = lp_build_const_int32(gallivm, 3);
657 shuffles[2] = lp_build_const_int32(gallivm, 6);
658 shuffles[3] = lp_build_const_int32(gallivm, 7);
659 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
660 LLVMConstVector(shuffles, 4), "");
661 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
662 LLVMConstVector(shuffles, 4), "");
663
664 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
665 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 0);
668 shuffles[1] = lp_build_const_int32(gallivm, 2);
669 shuffles[2] = lp_build_const_int32(gallivm, 4);
670 shuffles[3] = lp_build_const_int32(gallivm, 6);
671 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 shuffles[0] = lp_build_const_int32(gallivm, 1);
675 shuffles[1] = lp_build_const_int32(gallivm, 3);
676 shuffles[2] = lp_build_const_int32(gallivm, 5);
677 shuffles[3] = lp_build_const_int32(gallivm, 7);
678 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
679 LLVMConstVector(shuffles, 4), "");
680
681 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
682 }
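
/*
 * Illustrative scalar reference for the shuffled sums computed above
 * (hypothetical helper, not part of the build): result[i] is the sum of the
 * four elements of src[i].
 */
#if 0
static void
horizontal_add4x4f_ref(const float src[4][4], float result[4])
{
   unsigned i;
   for (i = 0; i < 4; i++)
      result[i] = src[i][0] + src[i][1] + src[i][2] + src[i][3];
}
#endif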
683
684
685 /*
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
689 * output order.
690 *
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
697 */
698 LLVMValueRef
699 lp_build_hadd_partial4(struct lp_build_context *bld,
700 LLVMValueRef vectors[],
701 unsigned num_vecs)
702 {
703 struct gallivm_state *gallivm = bld->gallivm;
704 LLVMBuilderRef builder = gallivm->builder;
705 LLVMValueRef ret_vec;
706 LLVMValueRef tmp[4];
707 const char *intrinsic = NULL;
708
709 assert(num_vecs >= 2 && num_vecs <= 4);
710 assert(bld->type.floating);
711
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
715 */
716
717 tmp[0] = vectors[0];
718 tmp[1] = vectors[1];
719
720 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
721 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
722
723 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
724 bld->type.length == 4) {
725 intrinsic = "llvm.x86.sse3.hadd.ps";
726 }
727 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
728 bld->type.length == 8) {
729 intrinsic = "llvm.x86.avx.hadd.ps.256";
730 }
731 if (intrinsic) {
732 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 if (num_vecs > 2) {
736 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[2], tmp[3]);
739 }
740 else {
741 tmp[1] = tmp[0];
742 }
743 return lp_build_intrinsic_binary(builder, intrinsic,
744 lp_build_vec_type(gallivm, bld->type),
745 tmp[0], tmp[1]);
746 }
747
748 if (bld->type.length == 4) {
749 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
750 }
751 else {
752 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
753 unsigned j;
754 unsigned num_iter = bld->type.length / 4;
755 struct lp_type parttype = bld->type;
756 parttype.length = 4;
757 for (j = 0; j < num_iter; j++) {
758 LLVMValueRef partsrc[4];
759 unsigned i;
760 for (i = 0; i < 4; i++) {
761 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
762 }
763 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
764 }
765 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
766 }
767 return ret_vec;
768 }
769
770 /**
771 * Generate a - b
772 */
773 LLVMValueRef
774 lp_build_sub(struct lp_build_context *bld,
775 LLVMValueRef a,
776 LLVMValueRef b)
777 {
778 LLVMBuilderRef builder = bld->gallivm->builder;
779 const struct lp_type type = bld->type;
780 LLVMValueRef res;
781
782 assert(lp_check_value(type, a));
783 assert(lp_check_value(type, b));
784
785 if(b == bld->zero)
786 return a;
787 if(a == bld->undef || b == bld->undef)
788 return bld->undef;
789 if(a == b)
790 return bld->zero;
791
792 if(bld->type.norm) {
793 const char *intrinsic = NULL;
794
795 if(b == bld->one)
796 return bld->zero;
797
798 if (type.width * type.length == 128 &&
799 !type.floating && !type.fixed) {
800 if (util_cpu_caps.has_sse2) {
801 if(type.width == 8)
802 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
803 if(type.width == 16)
804 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps.has_altivec) {
806 if(type.width == 8)
807 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
808 if(type.width == 16)
809 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
810 }
811 }
812
813 if(intrinsic)
814 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
815 }
816
817 if(type.norm && !type.floating && !type.fixed) {
818 if (type.sign) {
819 uint64_t sign = (uint64_t)1 << (type.width - 1);
820 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
821 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
825 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
826 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
827 } else {
828 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
829 }
830 }
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b))
833 if (type.floating)
834 res = LLVMConstFSub(a, b);
835 else
836 res = LLVMConstSub(a, b);
837 else
838 if (type.floating)
839 res = LLVMBuildFSub(builder, a, b, "");
840 else
841 res = LLVMBuildSub(builder, a, b, "");
842
843 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
844 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
845
846 return res;
847 }
848
849
850
851 /**
852 * Normalized multiplication.
853 *
854 * There are several approaches for (using 8-bit normalized multiplication as
855 * an example):
856 *
857 * - alpha plus one
858 *
859 * makes the following approximation to the division (Sree)
860 *
861 * a*b/255 ~= (a*(b + 1)) >> 8
862 *
863 * which is the fastest method that satisfies the following OpenGL criteria:
864 *
865 * 0*0 = 0 and 255*255 = 255
866 *
867 * - geometric series
868 *
869 * takes the geometric series approximation to the division
870 *
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
872 *
873 * in this case just the first two terms to fit in 16bit arithmetic
874 *
875 * t/255 ~= (t + (t >> 8)) >> 8
876 *
877 * note that just by itself it doesn't satisfy the OpenGL criteria, as
878 * 255*255 yields 254, so the special case b = 255 must be accounted for
879 * or roundoff must be used.
880 *
881 * - geometric series plus rounding
882 *
883 * when using a geometric series division instead of truncating the result,
884 * use roundoff in the approximation (Jim Blinn)
885 *
886 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
887 *
888 * achieving exact results.
889 *
890 *
891 *
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
896 */
897 static LLVMValueRef
898 lp_build_mul_norm(struct gallivm_state *gallivm,
899 struct lp_type wide_type,
900 LLVMValueRef a, LLVMValueRef b)
901 {
902 LLVMBuilderRef builder = gallivm->builder;
903 struct lp_build_context bld;
904 unsigned n;
905 LLVMValueRef half;
906 LLVMValueRef ab;
907
908 assert(!wide_type.floating);
909 assert(lp_check_value(wide_type, a));
910 assert(lp_check_value(wide_type, b));
911
912 lp_build_context_init(&bld, gallivm, wide_type);
913
914 n = wide_type.width / 2;
915 if (wide_type.sign) {
916 --n;
917 }
918
919 /*
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
922 */
923
924 /*
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
926 */
927
928 ab = LLVMBuildMul(builder, a, b, "");
929 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
930
931 /*
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
933 */
934
935 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
936 if (wide_type.sign) {
937 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
938 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
939 half = lp_build_select(&bld, sign, minus_half, half);
940 }
941 ab = LLVMBuildAdd(builder, ab, half, "");
942
943 /* Final division */
944 ab = lp_build_shr_imm(&bld, ab, n);
945
946 return ab;
947 }
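
/*
 * Illustrative scalar reference for the rounded 8-bit case described in the
 * comment above lp_build_mul_norm (Jim Blinn's rounding variant); hypothetical
 * helper, not part of the build.
 */
#if 0
static unsigned char
mul_unorm8_ref(unsigned char a, unsigned char b)
{
   unsigned t = (unsigned)a * b + 0x80;            /* product plus 0.5 rounding bias */
   return (unsigned char)((t + (t >> 8)) >> 8);    /* exact a*b/255 for all inputs */
}
#endif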
948
949 /**
950 * Generate a * b
951 */
952 LLVMValueRef
953 lp_build_mul(struct lp_build_context *bld,
954 LLVMValueRef a,
955 LLVMValueRef b)
956 {
957 LLVMBuilderRef builder = bld->gallivm->builder;
958 const struct lp_type type = bld->type;
959 LLVMValueRef shift;
960 LLVMValueRef res;
961
962 assert(lp_check_value(type, a));
963 assert(lp_check_value(type, b));
964
965 if(a == bld->zero)
966 return bld->zero;
967 if(a == bld->one)
968 return b;
969 if(b == bld->zero)
970 return bld->zero;
971 if(b == bld->one)
972 return a;
973 if(a == bld->undef || b == bld->undef)
974 return bld->undef;
975
976 if (!type.floating && !type.fixed && type.norm) {
977 struct lp_type wide_type = lp_wider_type(type);
978 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
979
980 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
981 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
982
983 /* PMULLW, PSRLW, PADDW */
984 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
985 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
986
987 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
988
989 return ab;
990 }
991
992 if(type.fixed)
993 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
994 else
995 shift = NULL;
996
997 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
998 if (type.floating)
999 res = LLVMConstFMul(a, b);
1000 else
1001 res = LLVMConstMul(a, b);
1002 if(shift) {
1003 if(type.sign)
1004 res = LLVMConstAShr(res, shift);
1005 else
1006 res = LLVMConstLShr(res, shift);
1007 }
1008 }
1009 else {
1010 if (type.floating)
1011 res = LLVMBuildFMul(builder, a, b, "");
1012 else
1013 res = LLVMBuildMul(builder, a, b, "");
1014 if(shift) {
1015 if(type.sign)
1016 res = LLVMBuildAShr(builder, res, shift, "");
1017 else
1018 res = LLVMBuildLShr(builder, res, shift, "");
1019 }
1020 }
1021
1022 return res;
1023 }
1024
1025
1026 /**
1027 * Small vector x scale multiplication optimization.
1028 */
1029 LLVMValueRef
1030 lp_build_mul_imm(struct lp_build_context *bld,
1031 LLVMValueRef a,
1032 int b)
1033 {
1034 LLVMBuilderRef builder = bld->gallivm->builder;
1035 LLVMValueRef factor;
1036
1037 assert(lp_check_value(bld->type, a));
1038
1039 if(b == 0)
1040 return bld->zero;
1041
1042 if(b == 1)
1043 return a;
1044
1045 if(b == -1)
1046 return lp_build_negate(bld, a);
1047
1048 if(b == 2 && bld->type.floating)
1049 return lp_build_add(bld, a, a);
1050
1051 if(util_is_power_of_two(b)) {
1052 unsigned shift = ffs(b) - 1;
1053
1054 if(bld->type.floating) {
1055 #if 0
1056 /*
1057 * Power of two multiplication by directly manipulating the exponent.
1058 *
1059 * XXX: This might not be always faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1061 * for Inf and NaN.
1062 */
1063 unsigned mantissa = lp_mantissa(bld->type);
1064 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1065 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1066 a = LLVMBuildAdd(builder, a, factor, "");
1067 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1068 return a;
1069 #endif
1070 }
1071 else {
1072 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1073 return LLVMBuildShl(builder, a, factor, "");
1074 }
1075 }
1076
1077 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1078 return lp_build_mul(bld, a, factor);
1079 }
1080
1081
1082 /**
1083 * Generate a / b
1084 */
1085 LLVMValueRef
1086 lp_build_div(struct lp_build_context *bld,
1087 LLVMValueRef a,
1088 LLVMValueRef b)
1089 {
1090 LLVMBuilderRef builder = bld->gallivm->builder;
1091 const struct lp_type type = bld->type;
1092
1093 assert(lp_check_value(type, a));
1094 assert(lp_check_value(type, b));
1095
1096 if(a == bld->zero)
1097 return bld->zero;
1098 if(a == bld->one && type.floating)
1099 return lp_build_rcp(bld, b);
1100 if(b == bld->zero)
1101 return bld->undef;
1102 if(b == bld->one)
1103 return a;
1104 if(a == bld->undef || b == bld->undef)
1105 return bld->undef;
1106
1107 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1108 if (type.floating)
1109 return LLVMConstFDiv(a, b);
1110 else if (type.sign)
1111 return LLVMConstSDiv(a, b);
1112 else
1113 return LLVMConstUDiv(a, b);
1114 }
1115
1116 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1117 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1118 type.floating)
1119 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1120
1121 if (type.floating)
1122 return LLVMBuildFDiv(builder, a, b, "");
1123 else if (type.sign)
1124 return LLVMBuildSDiv(builder, a, b, "");
1125 else
1126 return LLVMBuildUDiv(builder, a, b, "");
1127 }
1128
1129
1130 /**
1131 * Linear interpolation helper.
1132 *
1133 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1134 * interpolating normalized values encoded in integers twice as wide.
1135 *
1136 * @sa http://www.stereopsis.com/doubleblend.html
1137 */
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context *bld,
1140 LLVMValueRef x,
1141 LLVMValueRef v0,
1142 LLVMValueRef v1,
1143 unsigned flags)
1144 {
1145 unsigned half_width = bld->type.width/2;
1146 LLVMBuilderRef builder = bld->gallivm->builder;
1147 LLVMValueRef delta;
1148 LLVMValueRef res;
1149
1150 assert(lp_check_value(bld->type, x));
1151 assert(lp_check_value(bld->type, v0));
1152 assert(lp_check_value(bld->type, v1));
1153
1154 delta = lp_build_sub(bld, v1, v0);
1155
1156 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1157 if (!bld->type.sign) {
1158 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1159 /*
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161 * most-significant-bit to the lowest-significant-bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1163 */
1164
1165 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1166 }
1167
1168 /* (x * delta) >> n */
1169 res = lp_build_mul(bld, x, delta);
1170 res = lp_build_shr_imm(bld, res, half_width);
1171 } else {
1172 /*
1173 * The rescaling trick above doesn't work for signed numbers, so
1174 * use the 2**n - 1 division approximation in lp_build_mul_norm
1175 * instead.
1176 */
1177 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1178 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1179 }
1180 } else {
1181 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1182 res = lp_build_mul(bld, x, delta);
1183 }
1184
1185 res = lp_build_add(bld, v0, res);
1186
1187 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1188 bld->type.fixed) {
1189 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1190 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1191 * but it will be wrong for true fixed point use cases. Basically we need
1192 * a more powerful lp_type, capable of further distinguishing the values
1193 * interpretation from the value storage. */
1194 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1195 }
1196
1197 return res;
1198 }
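
/*
 * Illustrative scalar model of the unsigned LP_BLD_LERP_WIDE_NORMALIZED path
 * above (hypothetical helper, not part of the build): 8-bit values and weight
 * held in 16-bit lanes, with the weight rescaled from [0, 255] to [0, 256] so
 * the division becomes a shift; the final mask discards wrap-around bits, as
 * in the code above.
 */
#if 0
static unsigned
lerp_unorm8_ref(unsigned x, unsigned v0, unsigned v1)   /* all in [0, 255] */
{
   unsigned short delta = (unsigned short)(v1 - v0);    /* wraps like a 16-bit lane */
   unsigned prod;
   x = x + (x >> 7);                                    /* [0, 255] -> [0, 256] */
   prod = (unsigned short)(x * delta);                  /* 16-bit lane multiply */
   return (v0 + (prod >> 8)) & 0xff;                    /* mask off wrap-around bits */
}
#endif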
1199
1200
1201 /**
1202 * Linear interpolation.
1203 */
1204 LLVMValueRef
1205 lp_build_lerp(struct lp_build_context *bld,
1206 LLVMValueRef x,
1207 LLVMValueRef v0,
1208 LLVMValueRef v1,
1209 unsigned flags)
1210 {
1211 const struct lp_type type = bld->type;
1212 LLVMValueRef res;
1213
1214 assert(lp_check_value(type, x));
1215 assert(lp_check_value(type, v0));
1216 assert(lp_check_value(type, v1));
1217
1218 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1219
1220 if (type.norm) {
1221 struct lp_type wide_type;
1222 struct lp_build_context wide_bld;
1223 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1224
1225 assert(type.length >= 2);
1226
1227 /*
1228 * Create a wider integer type, enough to hold the
1229 * intermediate result of the multiplication.
1230 */
1231 memset(&wide_type, 0, sizeof wide_type);
1232 wide_type.sign = type.sign;
1233 wide_type.width = type.width*2;
1234 wide_type.length = type.length/2;
1235
1236 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1237
1238 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1239 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1240 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1241
1242 /*
1243 * Lerp both halves.
1244 */
1245
1246 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1247
1248 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1249 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1250
1251 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1252 } else {
1253 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1254 }
1255
1256 return res;
1257 }
1258
1259
1260 /**
1261 * Bilinear interpolation.
1262 *
1263 * Value indices are in v_{yx}.
1264 */
1265 LLVMValueRef
1266 lp_build_lerp_2d(struct lp_build_context *bld,
1267 LLVMValueRef x,
1268 LLVMValueRef y,
1269 LLVMValueRef v00,
1270 LLVMValueRef v01,
1271 LLVMValueRef v10,
1272 LLVMValueRef v11,
1273 unsigned flags)
1274 {
1275 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1276 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1277 return lp_build_lerp(bld, y, v0, v1, flags);
1278 }
1279
1280
1281 LLVMValueRef
1282 lp_build_lerp_3d(struct lp_build_context *bld,
1283 LLVMValueRef x,
1284 LLVMValueRef y,
1285 LLVMValueRef z,
1286 LLVMValueRef v000,
1287 LLVMValueRef v001,
1288 LLVMValueRef v010,
1289 LLVMValueRef v011,
1290 LLVMValueRef v100,
1291 LLVMValueRef v101,
1292 LLVMValueRef v110,
1293 LLVMValueRef v111,
1294 unsigned flags)
1295 {
1296 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1297 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1298 return lp_build_lerp(bld, z, v0, v1, flags);
1299 }
1300
1301
1302 /**
1303 * Generate min(a, b)
1304 * Do checks for special cases, but not for NaNs.
1305 */
1306 LLVMValueRef
1307 lp_build_min(struct lp_build_context *bld,
1308 LLVMValueRef a,
1309 LLVMValueRef b)
1310 {
1311 assert(lp_check_value(bld->type, a));
1312 assert(lp_check_value(bld->type, b));
1313
1314 if(a == bld->undef || b == bld->undef)
1315 return bld->undef;
1316
1317 if(a == b)
1318 return a;
1319
1320 if (bld->type.norm) {
1321 if (!bld->type.sign) {
1322 if (a == bld->zero || b == bld->zero) {
1323 return bld->zero;
1324 }
1325 }
1326 if(a == bld->one)
1327 return b;
1328 if(b == bld->one)
1329 return a;
1330 }
1331
1332 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1333 }
1334
1335
1336 /**
1337 * Generate min(a, b)
1338 * NaNs are handled according to the behavior specified by the
1339 * nan_behavior argument.
1340 */
1341 LLVMValueRef
1342 lp_build_min_ext(struct lp_build_context *bld,
1343 LLVMValueRef a,
1344 LLVMValueRef b,
1345 enum gallivm_nan_behavior nan_behavior)
1346 {
1347 assert(lp_check_value(bld->type, a));
1348 assert(lp_check_value(bld->type, b));
1349
1350 if(a == bld->undef || b == bld->undef)
1351 return bld->undef;
1352
1353 if(a == b)
1354 return a;
1355
1356 if (bld->type.norm) {
1357 if (!bld->type.sign) {
1358 if (a == bld->zero || b == bld->zero) {
1359 return bld->zero;
1360 }
1361 }
1362 if(a == bld->one)
1363 return b;
1364 if(b == bld->one)
1365 return a;
1366 }
1367
1368 return lp_build_min_simple(bld, a, b, nan_behavior);
1369 }
1370
1371 /**
1372 * Generate max(a, b)
1373 * Do checks for special cases, but NaN behavior is undefined.
1374 */
1375 LLVMValueRef
1376 lp_build_max(struct lp_build_context *bld,
1377 LLVMValueRef a,
1378 LLVMValueRef b)
1379 {
1380 assert(lp_check_value(bld->type, a));
1381 assert(lp_check_value(bld->type, b));
1382
1383 if(a == bld->undef || b == bld->undef)
1384 return bld->undef;
1385
1386 if(a == b)
1387 return a;
1388
1389 if(bld->type.norm) {
1390 if(a == bld->one || b == bld->one)
1391 return bld->one;
1392 if (!bld->type.sign) {
1393 if (a == bld->zero) {
1394 return b;
1395 }
1396 if (b == bld->zero) {
1397 return a;
1398 }
1399 }
1400 }
1401
1402 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1403 }
1404
1405
1406 /**
1407 * Generate max(a, b)
1408 * Checks for special cases.
1409 * NaNs are handled according to the behavior specified by the
1410 * nan_behavior argument.
1411 */
1412 LLVMValueRef
1413 lp_build_max_ext(struct lp_build_context *bld,
1414 LLVMValueRef a,
1415 LLVMValueRef b,
1416 enum gallivm_nan_behavior nan_behavior)
1417 {
1418 assert(lp_check_value(bld->type, a));
1419 assert(lp_check_value(bld->type, b));
1420
1421 if(a == bld->undef || b == bld->undef)
1422 return bld->undef;
1423
1424 if(a == b)
1425 return a;
1426
1427 if(bld->type.norm) {
1428 if(a == bld->one || b == bld->one)
1429 return bld->one;
1430 if (!bld->type.sign) {
1431 if (a == bld->zero) {
1432 return b;
1433 }
1434 if (b == bld->zero) {
1435 return a;
1436 }
1437 }
1438 }
1439
1440 return lp_build_max_simple(bld, a, b, nan_behavior);
1441 }
1442
1443 /**
1444 * Generate clamp(a, min, max)
1445 * NaN behavior (for any of a, min, max) is undefined.
1446 * Do checks for special cases.
1447 */
1448 LLVMValueRef
1449 lp_build_clamp(struct lp_build_context *bld,
1450 LLVMValueRef a,
1451 LLVMValueRef min,
1452 LLVMValueRef max)
1453 {
1454 assert(lp_check_value(bld->type, a));
1455 assert(lp_check_value(bld->type, min));
1456 assert(lp_check_value(bld->type, max));
1457
1458 a = lp_build_min(bld, a, max);
1459 a = lp_build_max(bld, a, min);
1460 return a;
1461 }
1462
1463
1464 /**
1465 * Generate clamp(a, 0, 1)
1466 * A NaN will get converted to zero.
1467 */
1468 LLVMValueRef
1469 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1470 LLVMValueRef a)
1471 {
1472 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1473 a = lp_build_min(bld, a, bld->one);
1474 return a;
1475 }
1476
1477
1478 /**
1479 * Generate abs(a)
1480 */
1481 LLVMValueRef
1482 lp_build_abs(struct lp_build_context *bld,
1483 LLVMValueRef a)
1484 {
1485 LLVMBuilderRef builder = bld->gallivm->builder;
1486 const struct lp_type type = bld->type;
1487 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1488
1489 assert(lp_check_value(type, a));
1490
1491 if(!type.sign)
1492 return a;
1493
1494 if(type.floating) {
1495 /* Mask out the sign bit */
1496 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1497 unsigned long long absMask = ~(1ULL << (type.width - 1));
1498 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1499 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1500 a = LLVMBuildAnd(builder, a, mask, "");
1501 a = LLVMBuildBitCast(builder, a, vec_type, "");
1502 return a;
1503 }
1504
1505 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1506 switch(type.width) {
1507 case 8:
1508 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1509 case 16:
1510 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1511 case 32:
1512 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1513 }
1514 }
1515 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1516 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1517 (type.width == 8 || type.width == 16 || type.width == 32)) {
1518 debug_printf("%s: inefficient code, should split vectors manually\n",
1519 __FUNCTION__);
1520 }
1521
1522 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1523 }
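
/*
 * Illustrative scalar model of the float path above (hypothetical helper, not
 * part of the build): clear the sign bit through an integer view of the
 * value.  Assumes 32-bit float and 32-bit unsigned.
 */
#if 0
static float
abs_bits_ref(float a)
{
   union { float f; unsigned u; } v;
   v.f = a;
   v.u &= 0x7fffffffu;    /* ~(1 << 31): mask out the sign bit */
   return v.f;
}
#endif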
1524
1525
1526 LLVMValueRef
1527 lp_build_negate(struct lp_build_context *bld,
1528 LLVMValueRef a)
1529 {
1530 LLVMBuilderRef builder = bld->gallivm->builder;
1531
1532 assert(lp_check_value(bld->type, a));
1533
1534 if (bld->type.floating)
1535 a = LLVMBuildFNeg(builder, a, "");
1536 else
1537 a = LLVMBuildNeg(builder, a, "");
1538
1539 return a;
1540 }
1541
1542
1543 /** Return -1, 0 or +1 depending on the sign of a */
1544 LLVMValueRef
1545 lp_build_sgn(struct lp_build_context *bld,
1546 LLVMValueRef a)
1547 {
1548 LLVMBuilderRef builder = bld->gallivm->builder;
1549 const struct lp_type type = bld->type;
1550 LLVMValueRef cond;
1551 LLVMValueRef res;
1552
1553 assert(lp_check_value(type, a));
1554
1555 /* Handle non-zero case */
1556 if(!type.sign) {
1557 /* if not zero then sign must be positive */
1558 res = bld->one;
1559 }
1560 else if(type.floating) {
1561 LLVMTypeRef vec_type;
1562 LLVMTypeRef int_type;
1563 LLVMValueRef mask;
1564 LLVMValueRef sign;
1565 LLVMValueRef one;
1566 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1567
1568 int_type = lp_build_int_vec_type(bld->gallivm, type);
1569 vec_type = lp_build_vec_type(bld->gallivm, type);
1570 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1571
1572 /* Take the sign bit and add it to 1 constant */
1573 sign = LLVMBuildBitCast(builder, a, int_type, "");
1574 sign = LLVMBuildAnd(builder, sign, mask, "");
1575 one = LLVMConstBitCast(bld->one, int_type);
1576 res = LLVMBuildOr(builder, sign, one, "");
1577 res = LLVMBuildBitCast(builder, res, vec_type, "");
1578 }
1579 else
1580 {
1581 /* signed int/norm/fixed point */
1582 /* could use psign with sse3 and appropriate vectors here */
1583 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1584 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1585 res = lp_build_select(bld, cond, bld->one, minus_one);
1586 }
1587
1588 /* Handle zero */
1589 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1590 res = lp_build_select(bld, cond, bld->zero, res);
1591
1592 return res;
1593 }
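
/*
 * Illustrative scalar model of the float path above (hypothetical helper, not
 * part of the build): transfer the sign bit of 'a' onto the constant 1.0,
 * then force the a == 0 case to zero.  Assumes 32-bit float and unsigned.
 */
#if 0
static float
sgn_ref(float a)
{
   union { float f; unsigned u; } one, v;
   one.f = 1.0f;
   v.f = a;
   one.u |= v.u & 0x80000000u;         /* copy the sign bit onto 1.0 */
   return (a == 0.0f) ? 0.0f : one.f;  /* zero case handled last */
}
#endif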
1594
1595
1596 /**
1597 * Set the sign of float vector 'a' according to 'sign'.
1598 * If sign==0, return abs(a).
1599 * If sign==1, return -abs(a).
1600 * Other values for sign produce undefined results.
1601 */
1602 LLVMValueRef
1603 lp_build_set_sign(struct lp_build_context *bld,
1604 LLVMValueRef a, LLVMValueRef sign)
1605 {
1606 LLVMBuilderRef builder = bld->gallivm->builder;
1607 const struct lp_type type = bld->type;
1608 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1609 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1610 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1611 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1612 ~((unsigned long long) 1 << (type.width - 1)));
1613 LLVMValueRef val, res;
1614
1615 assert(type.floating);
1616 assert(lp_check_value(type, a));
1617
1618 /* val = reinterpret_cast<int>(a) */
1619 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1620 /* val = val & mask */
1621 val = LLVMBuildAnd(builder, val, mask, "");
1622 /* sign = sign << shift */
1623 sign = LLVMBuildShl(builder, sign, shift, "");
1624 /* res = val | sign */
1625 res = LLVMBuildOr(builder, val, sign, "");
1626 /* res = reinterpret_cast<float>(res) */
1627 res = LLVMBuildBitCast(builder, res, vec_type, "");
1628
1629 return res;
1630 }
1631
1632
1633 /**
1634 * Convert vector of (or scalar) int to vector of (or scalar) float.
1635 */
1636 LLVMValueRef
1637 lp_build_int_to_float(struct lp_build_context *bld,
1638 LLVMValueRef a)
1639 {
1640 LLVMBuilderRef builder = bld->gallivm->builder;
1641 const struct lp_type type = bld->type;
1642 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1643
1644 assert(type.floating);
1645
1646 return LLVMBuildSIToFP(builder, a, vec_type, "");
1647 }
1648
1649 static boolean
1650 arch_rounding_available(const struct lp_type type)
1651 {
1652 if ((util_cpu_caps.has_sse4_1 &&
1653 (type.length == 1 || type.width*type.length == 128)) ||
1654 (util_cpu_caps.has_avx && type.width*type.length == 256))
1655 return TRUE;
1656 else if ((util_cpu_caps.has_altivec &&
1657 (type.width == 32 && type.length == 4)))
1658 return TRUE;
1659
1660 return FALSE;
1661 }
1662
1663 enum lp_build_round_mode
1664 {
1665 LP_BUILD_ROUND_NEAREST = 0,
1666 LP_BUILD_ROUND_FLOOR = 1,
1667 LP_BUILD_ROUND_CEIL = 2,
1668 LP_BUILD_ROUND_TRUNCATE = 3
1669 };
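
/*
 * These values are passed straight through as the rounding immediate of the
 * SSE4.1/AVX round intrinsics below (rounding control: 0 = nearest even,
 * 1 = floor, 2 = ceil, 3 = truncate); the AltiVec path maps them to
 * vrfin/vrfim/vrfip/vrfiz instead.
 */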
1670
1671 /**
1672 * Helper for SSE4.1's ROUNDxx instructions.
1673 *
1674 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1675 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1676 */
1677 static inline LLVMValueRef
1678 lp_build_round_sse41(struct lp_build_context *bld,
1679 LLVMValueRef a,
1680 enum lp_build_round_mode mode)
1681 {
1682 LLVMBuilderRef builder = bld->gallivm->builder;
1683 const struct lp_type type = bld->type;
1684 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1685 const char *intrinsic;
1686 LLVMValueRef res;
1687
1688 assert(type.floating);
1689
1690 assert(lp_check_value(type, a));
1691 assert(util_cpu_caps.has_sse4_1);
1692
1693 if (type.length == 1) {
1694 LLVMTypeRef vec_type;
1695 LLVMValueRef undef;
1696 LLVMValueRef args[3];
1697 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1698
1699 switch(type.width) {
1700 case 32:
1701 intrinsic = "llvm.x86.sse41.round.ss";
1702 break;
1703 case 64:
1704 intrinsic = "llvm.x86.sse41.round.sd";
1705 break;
1706 default:
1707 assert(0);
1708 return bld->undef;
1709 }
1710
1711 vec_type = LLVMVectorType(bld->elem_type, 4);
1712
1713 undef = LLVMGetUndef(vec_type);
1714
1715 args[0] = undef;
1716 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1717 args[2] = LLVMConstInt(i32t, mode, 0);
1718
1719 res = lp_build_intrinsic(builder, intrinsic,
1720 vec_type, args, Elements(args), 0);
1721
1722 res = LLVMBuildExtractElement(builder, res, index0, "");
1723 }
1724 else {
1725 if (type.width * type.length == 128) {
1726 switch(type.width) {
1727 case 32:
1728 intrinsic = "llvm.x86.sse41.round.ps";
1729 break;
1730 case 64:
1731 intrinsic = "llvm.x86.sse41.round.pd";
1732 break;
1733 default:
1734 assert(0);
1735 return bld->undef;
1736 }
1737 }
1738 else {
1739 assert(type.width * type.length == 256);
1740 assert(util_cpu_caps.has_avx);
1741
1742 switch(type.width) {
1743 case 32:
1744 intrinsic = "llvm.x86.avx.round.ps.256";
1745 break;
1746 case 64:
1747 intrinsic = "llvm.x86.avx.round.pd.256";
1748 break;
1749 default:
1750 assert(0);
1751 return bld->undef;
1752 }
1753 }
1754
1755 res = lp_build_intrinsic_binary(builder, intrinsic,
1756 bld->vec_type, a,
1757 LLVMConstInt(i32t, mode, 0));
1758 }
1759
1760 return res;
1761 }
1762
1763
1764 static inline LLVMValueRef
1765 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1766 LLVMValueRef a)
1767 {
1768 LLVMBuilderRef builder = bld->gallivm->builder;
1769 const struct lp_type type = bld->type;
1770 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1771 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1772 const char *intrinsic;
1773 LLVMValueRef res;
1774
1775 assert(type.floating);
1776 /* using the double precision conversions is a bit more complicated */
1777 assert(type.width == 32);
1778
1779 assert(lp_check_value(type, a));
1780 assert(util_cpu_caps.has_sse2);
1781
1782 /* This relies on the MXCSR rounding mode, which should always be nearest. */
1783 if (type.length == 1) {
1784 LLVMTypeRef vec_type;
1785 LLVMValueRef undef;
1786 LLVMValueRef arg;
1787 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1788
1789 vec_type = LLVMVectorType(bld->elem_type, 4);
1790
1791 intrinsic = "llvm.x86.sse.cvtss2si";
1792
1793 undef = LLVMGetUndef(vec_type);
1794
1795 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1796
1797 res = lp_build_intrinsic_unary(builder, intrinsic,
1798 ret_type, arg);
1799 }
1800 else {
1801 if (type.width* type.length == 128) {
1802 intrinsic = "llvm.x86.sse2.cvtps2dq";
1803 }
1804 else {
1805 assert(type.width*type.length == 256);
1806 assert(util_cpu_caps.has_avx);
1807
1808 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1809 }
1810 res = lp_build_intrinsic_unary(builder, intrinsic,
1811 ret_type, a);
1812 }
1813
1814 return res;
1815 }
1816
1817
1818 /*
1819 */
1820 static inline LLVMValueRef
1821 lp_build_round_altivec(struct lp_build_context *bld,
1822 LLVMValueRef a,
1823 enum lp_build_round_mode mode)
1824 {
1825 LLVMBuilderRef builder = bld->gallivm->builder;
1826 const struct lp_type type = bld->type;
1827 const char *intrinsic = NULL;
1828
1829 assert(type.floating);
1830
1831 assert(lp_check_value(type, a));
1832 assert(util_cpu_caps.has_altivec);
1833
1834 (void)type;
1835
1836 switch (mode) {
1837 case LP_BUILD_ROUND_NEAREST:
1838 intrinsic = "llvm.ppc.altivec.vrfin";
1839 break;
1840 case LP_BUILD_ROUND_FLOOR:
1841 intrinsic = "llvm.ppc.altivec.vrfim";
1842 break;
1843 case LP_BUILD_ROUND_CEIL:
1844 intrinsic = "llvm.ppc.altivec.vrfip";
1845 break;
1846 case LP_BUILD_ROUND_TRUNCATE:
1847 intrinsic = "llvm.ppc.altivec.vrfiz";
1848 break;
1849 }
1850
1851 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1852 }
1853
1854 static inline LLVMValueRef
1855 lp_build_round_arch(struct lp_build_context *bld,
1856 LLVMValueRef a,
1857 enum lp_build_round_mode mode)
1858 {
1859 if (util_cpu_caps.has_sse4_1)
1860 return lp_build_round_sse41(bld, a, mode);
1861 else /* (util_cpu_caps.has_altivec) */
1862 return lp_build_round_altivec(bld, a, mode);
1863 }
1864
1865 /**
1866 * Return the integer part of a float (vector) value (== round toward zero).
1867 * The returned value is a float (vector).
1868 * Ex: trunc(-1.5) = -1.0
1869 */
1870 LLVMValueRef
1871 lp_build_trunc(struct lp_build_context *bld,
1872 LLVMValueRef a)
1873 {
1874 LLVMBuilderRef builder = bld->gallivm->builder;
1875 const struct lp_type type = bld->type;
1876
1877 assert(type.floating);
1878 assert(lp_check_value(type, a));
1879
1880 if (arch_rounding_available(type)) {
1881 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1882 }
1883 else {
1884 const struct lp_type type = bld->type;
1885 struct lp_type inttype;
1886 struct lp_build_context intbld;
1887 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1888 LLVMValueRef trunc, res, anosign, mask;
1889 LLVMTypeRef int_vec_type = bld->int_vec_type;
1890 LLVMTypeRef vec_type = bld->vec_type;
1891
1892 assert(type.width == 32); /* might want to handle doubles at some point */
1893
1894 inttype = type;
1895 inttype.floating = 0;
1896 lp_build_context_init(&intbld, bld->gallivm, inttype);
1897
1898 /* round by truncation */
1899 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1900 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1901
1902 /* mask out sign bit */
1903 anosign = lp_build_abs(bld, a);
1904 /*
1905 * mask out all values if anosign > 2^24
1906        * This should work both for large ints (all rounding is a no-op for them
1907        * because such floats are always exact) as well as special cases like
1908        * NaNs, Infs (taking advantage of the fact that they use max exponent).
1909        * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1910 */
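/*
 * Illustrative note (numbers assumed from IEEE-754 single precision):
 * at 2^23 the spacing between consecutive floats reaches 1.0, so every
 * float with magnitude >= 2^23 is already an integer and truncation is a
 * no-op for it; e.g. 16777216.0 (2^24) has a ULP of 2.0.  The compare
 * against 2^24 therefore only serves to route huge values, NaNs and Infs
 * (whose FPToSI conversion would be undefined) around the trunc path.
 */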
1911 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1912 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1913 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1914 return lp_build_select(bld, mask, a, res);
1915 }
1916 }
1917
1918
1919 /**
1920 * Return float (vector) rounded to nearest integer (vector). The returned
1921 * value is a float (vector).
1922 * Ex: round(0.9) = 1.0
1923 * Ex: round(-1.5) = -2.0
1924 */
1925 LLVMValueRef
1926 lp_build_round(struct lp_build_context *bld,
1927 LLVMValueRef a)
1928 {
1929 LLVMBuilderRef builder = bld->gallivm->builder;
1930 const struct lp_type type = bld->type;
1931
1932 assert(type.floating);
1933 assert(lp_check_value(type, a));
1934
1935 if (arch_rounding_available(type)) {
1936 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1937 }
1938 else {
1939 const struct lp_type type = bld->type;
1940 struct lp_type inttype;
1941 struct lp_build_context intbld;
1942 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1943 LLVMValueRef res, anosign, mask;
1944 LLVMTypeRef int_vec_type = bld->int_vec_type;
1945 LLVMTypeRef vec_type = bld->vec_type;
1946
1947 assert(type.width == 32); /* might want to handle doubles at some point */
1948
1949 inttype = type;
1950 inttype.floating = 0;
1951 lp_build_context_init(&intbld, bld->gallivm, inttype);
1952
1953 res = lp_build_iround(bld, a);
1954 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1955
1956 /* mask out sign bit */
1957 anosign = lp_build_abs(bld, a);
1958 /*
1959 * mask out all values if anosign > 2^24
1960        * This should work both for large ints (all rounding is a no-op for them
1961        * because such floats are always exact) as well as special cases like
1962        * NaNs, Infs (taking advantage of the fact that they use max exponent).
1963        * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1964 */
1965 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1966 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1967 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1968 return lp_build_select(bld, mask, a, res);
1969 }
1970 }
1971
1972
1973 /**
1974 * Return floor of float (vector), result is a float (vector)
1975 * Ex: floor(1.1) = 1.0
1976 * Ex: floor(-1.1) = -2.0
1977 */
1978 LLVMValueRef
1979 lp_build_floor(struct lp_build_context *bld,
1980 LLVMValueRef a)
1981 {
1982 LLVMBuilderRef builder = bld->gallivm->builder;
1983 const struct lp_type type = bld->type;
1984
1985 assert(type.floating);
1986 assert(lp_check_value(type, a));
1987
1988 if (arch_rounding_available(type)) {
1989 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1990 }
1991 else {
1992 const struct lp_type type = bld->type;
1993 struct lp_type inttype;
1994 struct lp_build_context intbld;
1995 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1996 LLVMValueRef trunc, res, anosign, mask;
1997 LLVMTypeRef int_vec_type = bld->int_vec_type;
1998 LLVMTypeRef vec_type = bld->vec_type;
1999
2000 if (type.width != 32) {
2001 char intrinsic[32];
2002 util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
2003 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2004 }
2005
2006 assert(type.width == 32); /* might want to handle doubles at some point */
2007
2008 inttype = type;
2009 inttype.floating = 0;
2010 lp_build_context_init(&intbld, bld->gallivm, inttype);
2011
2012 /* round by truncation */
2013 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2014 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2015
2016 if (type.sign) {
2017 LLVMValueRef tmp;
2018
2019 /*
2020 * fix values if rounding is wrong (for non-special cases)
2021 * - this is the case if trunc > a
2022 */
2023 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2024 /* tmp = trunc > a ? 1.0 : 0.0 */
2025 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2026 tmp = lp_build_and(&intbld, mask, tmp);
2027 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2028 res = lp_build_sub(bld, res, tmp);
2029 }
2030
2031 /* mask out sign bit */
2032 anosign = lp_build_abs(bld, a);
2033 /*
2034 * mask out all values if anosign > 2^24
2035        * This should work both for large ints (all rounding is a no-op for them
2036        * because such floats are always exact) as well as special cases like
2037        * NaNs, Infs (taking advantage of the fact that they use max exponent).
2038        * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2039 */
2040 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2041 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2042 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2043 return lp_build_select(bld, mask, a, res);
2044 }
2045 }
2046
2047
2048 /**
2049 * Return ceiling of float (vector), returning float (vector).
2050 * Ex: ceil( 1.1) = 2.0
2051 * Ex: ceil(-1.1) = -1.0
2052 */
2053 LLVMValueRef
2054 lp_build_ceil(struct lp_build_context *bld,
2055 LLVMValueRef a)
2056 {
2057 LLVMBuilderRef builder = bld->gallivm->builder;
2058 const struct lp_type type = bld->type;
2059
2060 assert(type.floating);
2061 assert(lp_check_value(type, a));
2062
2063 if (arch_rounding_available(type)) {
2064 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2065 }
2066 else {
2067 const struct lp_type type = bld->type;
2068 struct lp_type inttype;
2069 struct lp_build_context intbld;
2070 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2071 LLVMValueRef trunc, res, anosign, mask, tmp;
2072 LLVMTypeRef int_vec_type = bld->int_vec_type;
2073 LLVMTypeRef vec_type = bld->vec_type;
2074
2075 if (type.width != 32) {
2076 char intrinsic[32];
2077 util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
2078 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2079 }
2080
2081 assert(type.width == 32); /* might want to handle doubles at some point */
2082
2083 inttype = type;
2084 inttype.floating = 0;
2085 lp_build_context_init(&intbld, bld->gallivm, inttype);
2086
2087 /* round by truncation */
2088 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2089 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2090
2091 /*
2092 * fix values if rounding is wrong (for non-special cases)
2093 * - this is the case if trunc < a
2094 */
2095 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2096 /* tmp = trunc < a ? 1.0 : 0.0 */
2097 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2098 tmp = lp_build_and(&intbld, mask, tmp);
2099 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2100 res = lp_build_add(bld, trunc, tmp);
2101
2102 /* mask out sign bit */
2103 anosign = lp_build_abs(bld, a);
2104 /*
2105 * mask out all values if anosign > 2^24
2106        * This should work both for large ints (all rounding is a no-op for them
2107        * because such floats are always exact) as well as special cases like
2108        * NaNs, Infs (taking advantage of the fact that they use max exponent).
2109        * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2110 */
2111 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2112 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2113 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2114 return lp_build_select(bld, mask, a, res);
2115 }
2116 }
2117
2118
2119 /**
2120 * Return fractional part of 'a' computed as a - floor(a)
2121 * Typically used in texture coord arithmetic.
2122 */
2123 LLVMValueRef
2124 lp_build_fract(struct lp_build_context *bld,
2125 LLVMValueRef a)
2126 {
2127 assert(bld->type.floating);
2128 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2129 }
2130
2131
2132 /**
2133 * Prevent returning a fractional part of 1.0 for very small negative values of
2134 * 'a' by clamping against 0.99999(9).
2135 */
2136 static inline LLVMValueRef
2137 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2138 {
2139 LLVMValueRef max;
2140
2141 /* this is the largest number smaller than 1.0 representable as float */
2142 max = lp_build_const_vec(bld->gallivm, bld->type,
2143 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2144 return lp_build_min(bld, fract, max);
2145 }
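/*
 * Worked example for clamp_fract above, assuming 32-bit floats
 * (lp_mantissa() == 23): the clamp constant is
 * 1.0 - 1.0/2^24 = 0.99999994..., which is exactly the largest float
 * strictly below 1.0 (bit pattern 0x3f7fffff).
 */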
2146
2147
2148 /**
2149 * Same as lp_build_fract, but guarantees that the result is always smaller
2150 * than one.
2151 */
2152 LLVMValueRef
2153 lp_build_fract_safe(struct lp_build_context *bld,
2154 LLVMValueRef a)
2155 {
2156 return clamp_fract(bld, lp_build_fract(bld, a));
2157 }
2158
2159
2160 /**
2161 * Return the integer part of a float (vector) value (== round toward zero).
2162 * The returned value is an integer (vector).
2163 * Ex: itrunc(-1.5) = -1
2164 */
2165 LLVMValueRef
2166 lp_build_itrunc(struct lp_build_context *bld,
2167 LLVMValueRef a)
2168 {
2169 LLVMBuilderRef builder = bld->gallivm->builder;
2170 const struct lp_type type = bld->type;
2171 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2172
2173 assert(type.floating);
2174 assert(lp_check_value(type, a));
2175
2176 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2177 }
2178
2179
2180 /**
2181 * Return float (vector) rounded to nearest integer (vector). The returned
2182 * value is an integer (vector).
2183 * Ex: iround(0.9) = 1
2184 * Ex: iround(-1.5) = -2
2185 */
2186 LLVMValueRef
2187 lp_build_iround(struct lp_build_context *bld,
2188 LLVMValueRef a)
2189 {
2190 LLVMBuilderRef builder = bld->gallivm->builder;
2191 const struct lp_type type = bld->type;
2192 LLVMTypeRef int_vec_type = bld->int_vec_type;
2193 LLVMValueRef res;
2194
2195 assert(type.floating);
2196
2197 assert(lp_check_value(type, a));
2198
2199 if ((util_cpu_caps.has_sse2 &&
2200 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2201 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2202 return lp_build_iround_nearest_sse2(bld, a);
2203 }
2204 if (arch_rounding_available(type)) {
2205 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2206 }
2207 else {
2208 LLVMValueRef half;
2209
2210 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2211
2212 if (type.sign) {
2213 LLVMTypeRef vec_type = bld->vec_type;
2214 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2215 (unsigned long long)1 << (type.width - 1));
2216 LLVMValueRef sign;
2217
2218 /* get sign bit */
2219 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2220 sign = LLVMBuildAnd(builder, sign, mask, "");
2221
2222 /* sign * 0.5 */
2223 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2224 half = LLVMBuildOr(builder, sign, half, "");
2225 half = LLVMBuildBitCast(builder, half, vec_type, "");
2226 }
2227
2228 res = LLVMBuildFAdd(builder, a, half, "");
2229 }
2230
2231 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2232
2233 return res;
2234 }
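/*
 * Illustrative trace of the add-signed-half fallback in lp_build_iround
 * above:  a = 0.9  ->  0.9 + 0.5  =  1.4 -> fptosi ->  1
 *         a = -1.3 -> -1.3 + (-0.5) = -1.8 -> fptosi -> -1
 *         a = -1.5 -> -1.5 + (-0.5) = -2.0 -> fptosi -> -2
 * i.e. the sign bit of 'a' is OR'ed into 0.5 so that on this path halfway
 * cases round away from zero, matching the examples in the comment above.
 */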
2235
2236
2237 /**
2238 * Return floor of float (vector), result is an int (vector)
2239  * Ex: ifloor(1.1) = 1
2240  * Ex: ifloor(-1.1) = -2
2241 */
2242 LLVMValueRef
2243 lp_build_ifloor(struct lp_build_context *bld,
2244 LLVMValueRef a)
2245 {
2246 LLVMBuilderRef builder = bld->gallivm->builder;
2247 const struct lp_type type = bld->type;
2248 LLVMTypeRef int_vec_type = bld->int_vec_type;
2249 LLVMValueRef res;
2250
2251 assert(type.floating);
2252 assert(lp_check_value(type, a));
2253
2254 res = a;
2255 if (type.sign) {
2256 if (arch_rounding_available(type)) {
2257 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2258 }
2259 else {
2260 struct lp_type inttype;
2261 struct lp_build_context intbld;
2262 LLVMValueRef trunc, itrunc, mask;
2263
2264 assert(type.floating);
2265 assert(lp_check_value(type, a));
2266
2267 inttype = type;
2268 inttype.floating = 0;
2269 lp_build_context_init(&intbld, bld->gallivm, inttype);
2270
2271 /* round by truncation */
2272 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2273 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2274
2275 /*
2276 * fix values if rounding is wrong (for non-special cases)
2277 * - this is the case if trunc > a
2278          * The results of doing this with NaNs, very large values etc. are
2279          * undefined, but results for such inputs are undefined anyway.
2280 */
2281 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2282 /* cheapie minus one with mask since the mask is minus one / zero */
2283 return lp_build_add(&intbld, itrunc, mask);
2284 }
2285 }
2286
2287    /* convert to int (round toward zero) */
2288 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2289
2290 return res;
2291 }
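/*
 * Worked example for the mask trick in lp_build_ifloor above:
 * a = -1.1 -> itrunc = -1, trunc = -1.0; trunc > a, so the compare mask
 * is all ones, i.e. -1 as a signed integer, and itrunc + mask = -2.
 * a = 1.1 -> trunc = 1.0 is not > a, mask = 0, and the result stays 1.
 * (lp_build_iceil below uses the same idea with the comparison reversed
 * and a subtraction instead of an addition.)
 */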
2292
2293
2294 /**
2295 * Return ceiling of float (vector), returning int (vector).
2296 * Ex: iceil( 1.1) = 2
2297 * Ex: iceil(-1.1) = -1
2298 */
2299 LLVMValueRef
2300 lp_build_iceil(struct lp_build_context *bld,
2301 LLVMValueRef a)
2302 {
2303 LLVMBuilderRef builder = bld->gallivm->builder;
2304 const struct lp_type type = bld->type;
2305 LLVMTypeRef int_vec_type = bld->int_vec_type;
2306 LLVMValueRef res;
2307
2308 assert(type.floating);
2309 assert(lp_check_value(type, a));
2310
2311 if (arch_rounding_available(type)) {
2312 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2313 }
2314 else {
2315 struct lp_type inttype;
2316 struct lp_build_context intbld;
2317 LLVMValueRef trunc, itrunc, mask;
2318
2319 assert(type.floating);
2320 assert(lp_check_value(type, a));
2321
2322 inttype = type;
2323 inttype.floating = 0;
2324 lp_build_context_init(&intbld, bld->gallivm, inttype);
2325
2326 /* round by truncation */
2327 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2328 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2329
2330 /*
2331 * fix values if rounding is wrong (for non-special cases)
2332 * - this is the case if trunc < a
2333       * The results of doing this with NaNs, very large values etc. are
2334       * undefined, but results for such inputs are undefined anyway.
2335 */
2336 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2337 /* cheapie plus one with mask since the mask is minus one / zero */
2338 return lp_build_sub(&intbld, itrunc, mask);
2339 }
2340
2341    /* convert to int (round toward zero) */
2342 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2343
2344 return res;
2345 }
2346
2347
2348 /**
2349 * Combined ifloor() & fract().
2350 *
2351 * Preferred to calling the functions separately, as it will ensure that the
2352 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2353 */
2354 void
2355 lp_build_ifloor_fract(struct lp_build_context *bld,
2356 LLVMValueRef a,
2357 LLVMValueRef *out_ipart,
2358 LLVMValueRef *out_fpart)
2359 {
2360 LLVMBuilderRef builder = bld->gallivm->builder;
2361 const struct lp_type type = bld->type;
2362 LLVMValueRef ipart;
2363
2364 assert(type.floating);
2365 assert(lp_check_value(type, a));
2366
2367 if (arch_rounding_available(type)) {
2368 /*
2369 * floor() is easier.
2370 */
2371
2372 ipart = lp_build_floor(bld, a);
2373 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2374 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2375 }
2376 else {
2377 /*
2378 * ifloor() is easier.
2379 */
2380
2381 *out_ipart = lp_build_ifloor(bld, a);
2382 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2383 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2384 }
2385 }
2386
2387
2388 /**
2389 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2390 * always smaller than one.
2391 */
2392 void
2393 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2394 LLVMValueRef a,
2395 LLVMValueRef *out_ipart,
2396 LLVMValueRef *out_fpart)
2397 {
2398 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2399 *out_fpart = clamp_fract(bld, *out_fpart);
2400 }
2401
2402
2403 LLVMValueRef
2404 lp_build_sqrt(struct lp_build_context *bld,
2405 LLVMValueRef a)
2406 {
2407 LLVMBuilderRef builder = bld->gallivm->builder;
2408 const struct lp_type type = bld->type;
2409 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2410 char intrinsic[32];
2411
2412 assert(lp_check_value(type, a));
2413
2414 /* TODO: optimize the constant case */
2415
2416 assert(type.floating);
2417 if (type.length == 1) {
2418 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2419 }
2420 else {
2421 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2422 }
2423
2424 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2425 }
2426
2427
2428 /**
2429  * Do one Newton-Raphson step to improve reciprocal precision:
2430 *
2431 * x_{i+1} = x_i * (2 - a * x_i)
2432 *
2433 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2434  * +/-Inf, giving NaN instead. Certain applications rely on the IEEE behavior
2435  * here, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2436 * halo. It would be necessary to clamp the argument to prevent this.
2437 *
2438 * See also:
2439 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2440 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2441 */
2442 static inline LLVMValueRef
2443 lp_build_rcp_refine(struct lp_build_context *bld,
2444 LLVMValueRef a,
2445 LLVMValueRef rcp_a)
2446 {
2447 LLVMBuilderRef builder = bld->gallivm->builder;
2448 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2449 LLVMValueRef res;
2450
2451 res = LLVMBuildFMul(builder, a, rcp_a, "");
2452 res = LLVMBuildFSub(builder, two, res, "");
2453 res = LLVMBuildFMul(builder, rcp_a, res, "");
2454
2455 return res;
2456 }
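/*
 * Sketch of why the step above converges (standard Newton-Raphson algebra,
 * not specific to this code): if x_i = (1 - e)/a for some relative error e,
 * then
 *   a * x_i           = 1 - e
 *   2 - a * x_i       = 1 + e
 *   x_i * (2 - a*x_i) = (1 - e)*(1 + e)/a = (1 - e^2)/a
 * so each step squares the relative error, roughly doubling the number of
 * correct bits, except for the 0/Inf cases noted above.
 */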
2457
2458
2459 LLVMValueRef
2460 lp_build_rcp(struct lp_build_context *bld,
2461 LLVMValueRef a)
2462 {
2463 LLVMBuilderRef builder = bld->gallivm->builder;
2464 const struct lp_type type = bld->type;
2465
2466 assert(lp_check_value(type, a));
2467
2468 if(a == bld->zero)
2469 return bld->undef;
2470 if(a == bld->one)
2471 return bld->one;
2472 if(a == bld->undef)
2473 return bld->undef;
2474
2475 assert(type.floating);
2476
2477 if(LLVMIsConstant(a))
2478 return LLVMConstFDiv(bld->one, a);
2479
2480 /*
2481 * We don't use RCPPS because:
2482     * - it only has 10 bits of precision
2483     * - it doesn't even get the reciprocal of 1.0 exactly
2484     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2485     * - for recent processors the benefit over DIVPS is marginal, and case
2486     *   dependent
2487     *
2488     * We could still use it on certain processors if benchmarks show that the
2489     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2490     * particular uses that require fewer workarounds.
2491 */
2492
2493 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2494 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2495 const unsigned num_iterations = 0;
2496 LLVMValueRef res;
2497 unsigned i;
2498 const char *intrinsic = NULL;
2499
2500 if (type.length == 4) {
2501 intrinsic = "llvm.x86.sse.rcp.ps";
2502 }
2503 else {
2504 intrinsic = "llvm.x86.avx.rcp.ps.256";
2505 }
2506
2507 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2508
2509 for (i = 0; i < num_iterations; ++i) {
2510 res = lp_build_rcp_refine(bld, a, res);
2511 }
2512
2513 return res;
2514 }
2515
2516 return LLVMBuildFDiv(builder, bld->one, a, "");
2517 }
2518
2519
2520 /**
2521 * Do one Newton-Raphson step to improve rsqrt precision:
2522 *
2523 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2524 *
2525 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2526 */
2527 static inline LLVMValueRef
2528 lp_build_rsqrt_refine(struct lp_build_context *bld,
2529 LLVMValueRef a,
2530 LLVMValueRef rsqrt_a)
2531 {
2532 LLVMBuilderRef builder = bld->gallivm->builder;
2533 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2534 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2535 LLVMValueRef res;
2536
2537 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2538 res = LLVMBuildFMul(builder, a, res, "");
2539 res = LLVMBuildFSub(builder, three, res, "");
2540 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2541 res = LLVMBuildFMul(builder, half, res, "");
2542
2543 return res;
2544 }
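/*
 * Sketch of the corresponding error analysis (standard Newton-Raphson
 * algebra, not specific to this code): if x_i = (1 + e)/sqrt(a), then
 *   a * x_i * x_i       = (1 + e)^2
 *   3 - a * x_i * x_i   = 2 - 2e - e^2
 *   0.5 * x_i * (above) = (1 - 1.5*e^2 - 0.5*e^3)/sqrt(a)
 * i.e. the relative error drops from e to about -1.5*e^2 per step.
 */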
2545
2546
2547 /**
2548 * Generate 1/sqrt(a).
2549 * Result is undefined for values < 0, infinity for +0.
2550 */
2551 LLVMValueRef
2552 lp_build_rsqrt(struct lp_build_context *bld,
2553 LLVMValueRef a)
2554 {
2555 const struct lp_type type = bld->type;
2556
2557 assert(lp_check_value(type, a));
2558
2559 assert(type.floating);
2560
2561 /*
2562 * This should be faster but all denormals will end up as infinity.
2563 */
2564 if (0 && lp_build_fast_rsqrt_available(type)) {
2565 const unsigned num_iterations = 1;
2566 LLVMValueRef res;
2567 unsigned i;
2568
2569 /* rsqrt(1.0) != 1.0 here */
2570 res = lp_build_fast_rsqrt(bld, a);
2571
2572 if (num_iterations) {
2573 /*
2574 * Newton-Raphson will result in NaN instead of infinity for zero,
2575 * and NaN instead of zero for infinity.
2576 * Also, need to ensure rsqrt(1.0) == 1.0.
2577 * All numbers smaller than FLT_MIN will result in +infinity
2578 * (rsqrtps treats all denormals as zero).
2579 */
2580 LLVMValueRef cmp;
2581 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2582 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2583
2584 for (i = 0; i < num_iterations; ++i) {
2585 res = lp_build_rsqrt_refine(bld, a, res);
2586 }
2587 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2588 res = lp_build_select(bld, cmp, inf, res);
2589 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2590 res = lp_build_select(bld, cmp, bld->zero, res);
2591 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2592 res = lp_build_select(bld, cmp, bld->one, res);
2593 }
2594
2595 return res;
2596 }
2597
2598 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2599 }
2600
2601 /**
2602  * Returns true if a fast (inaccurate) rsqrt instruction is available.
2603  * (The caller may want to avoid calling rsqrt_fast if it's not available;
2604  * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2605  * unavailable it would result in sqrt/div/mul, so it is obviously
2606  * much better to just call sqrt, skipping both div and mul).
2607 */
2608 boolean
2609 lp_build_fast_rsqrt_available(struct lp_type type)
2610 {
2611 assert(type.floating);
2612
2613 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2614 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2615 return true;
2616 }
2617 return false;
2618 }
2619
2620
2621 /**
2622 * Generate 1/sqrt(a).
2623 * Result is undefined for values < 0, infinity for +0.
2624 * Precision is limited, only ~10 bits guaranteed
2625 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2626 */
2627 LLVMValueRef
2628 lp_build_fast_rsqrt(struct lp_build_context *bld,
2629 LLVMValueRef a)
2630 {
2631 LLVMBuilderRef builder = bld->gallivm->builder;
2632 const struct lp_type type = bld->type;
2633
2634 assert(lp_check_value(type, a));
2635
2636 if (lp_build_fast_rsqrt_available(type)) {
2637 const char *intrinsic = NULL;
2638
2639 if (type.length == 4) {
2640 intrinsic = "llvm.x86.sse.rsqrt.ps";
2641 }
2642 else {
2643 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2644 }
2645 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2646 }
2647 else {
2648 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2649 }
2650 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2651 }
2652
2653
2654 /**
2655 * Generate sin(a) or cos(a) using polynomial approximation.
2656  * TODO: it might be worth recognizing sin and cos using the same source
2657  * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2658  * would be way cheaper than calculating (nearly) everything twice...
2659  * Not sure it's common enough to be worth bothering with, however; the
2660  * scs opcode could also benefit from calculating both.
2661 */
2662 static LLVMValueRef
2663 lp_build_sin_or_cos(struct lp_build_context *bld,
2664 LLVMValueRef a,
2665 boolean cos)
2666 {
2667 struct gallivm_state *gallivm = bld->gallivm;
2668 LLVMBuilderRef b = gallivm->builder;
2669 struct lp_type int_type = lp_int_type(bld->type);
2670
2671 /*
2672 * take the absolute value,
2673 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2674 */
2675
2676 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2677 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2678
2679 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2680 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2681
2682 /*
2683 * scale by 4/Pi
2684 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2685 */
2686
2687 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2688 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2689
2690 /*
2691 * store the integer part of y in mm0
2692 * emm2 = _mm_cvttps_epi32(y);
2693 */
2694
2695 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2696
2697 /*
2698 * j=(j+1) & (~1) (see the cephes sources)
2699 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2700 */
2701
2702 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2703 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2704 /*
2705 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2706 */
2707 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2708 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2709
2710 /*
2711 * y = _mm_cvtepi32_ps(emm2);
2712 */
2713 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2714
2715 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2716 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2717 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2718 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2719
2720 /*
2721 * Argument used for poly selection and sign bit determination
2722 * is different for sin vs. cos.
2723 */
2724 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2725 emm2_and;
2726
2727 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2728 LLVMBuildNot(b, emm2_2, ""), ""),
2729 const_29, "sign_bit") :
2730 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2731 LLVMBuildShl(b, emm2_add,
2732 const_29, ""), ""),
2733 sign_mask, "sign_bit");
2734
2735 /*
2736     * get the polynomial selection mask
2737     * there is one polynomial for 0 <= x <= Pi/4
2738     * and another one for Pi/4 < x <= Pi/2
2739 * Both branches will be computed.
2740 *
2741 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2742 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2743 */
2744
2745 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2746 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2747 int_type, PIPE_FUNC_EQUAL,
2748 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
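/*
 * Illustrative trace (sin path, a = 1.0): |a|*4/Pi ~ 1.273 truncates to
 * j = 1, then (j+1) & ~1 = 2, so y = 2.0 and the reduced argument becomes
 * 1.0 - 2*Pi/4 ~ -0.5708.  Here emm2_3 = 2 & 2 != 0, so poly_mask is zero
 * and the cosine polynomial is selected further down:
 * cos(-0.5708) ~ 0.8415 = sin(1.0), with a cleared sign bit.
 */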
2749
2750 /*
2751 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2752 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2753 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2754 */
2755 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2756 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2757 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2758
2759 /*
2760 * The magic pass: "Extended precision modular arithmetic"
2761 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2762 * xmm1 = _mm_mul_ps(y, xmm1);
2763 * xmm2 = _mm_mul_ps(y, xmm2);
2764 * xmm3 = _mm_mul_ps(y, xmm3);
2765 */
2766 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2767 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2768 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2769
2770 /*
2771 * x = _mm_add_ps(x, xmm1);
2772 * x = _mm_add_ps(x, xmm2);
2773 * x = _mm_add_ps(x, xmm3);
2774 */
2775
2776 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2777 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2778 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2779
2780 /*
2781     * Evaluate the first polynomial (0 <= x <= Pi/4)
2782 *
2783 * z = _mm_mul_ps(x,x);
2784 */
2785 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2786
2787 /*
2788 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2789 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2790 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2791 */
2792 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2793 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2794 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2795
2796 /*
2797 * y = *(v4sf*)_ps_coscof_p0;
2798 * y = _mm_mul_ps(y, z);
2799 */
2800 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2801 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2802 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2803 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2804 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2805 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2806
2807
2808 /*
2809 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2810 * y = _mm_sub_ps(y, tmp);
2811 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2812 */
2813 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2814 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2815    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2816 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2817    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2818
2819 /*
2820 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2821 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2822 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2823 */
2824 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2825 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2826 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2827
2828 /*
2829     * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2830 *
2831 * y2 = *(v4sf*)_ps_sincof_p0;
2832 * y2 = _mm_mul_ps(y2, z);
2833 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2834 * y2 = _mm_mul_ps(y2, z);
2835 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2836 * y2 = _mm_mul_ps(y2, z);
2837 * y2 = _mm_mul_ps(y2, x);
2838 * y2 = _mm_add_ps(y2, x);
2839 */
2840
2841 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2842 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2843 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2844 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2845 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2846 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2847 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2848
2849 /*
2850     * select the correct result from the two polynomials
2851 * xmm3 = poly_mask;
2852 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2853 * y = _mm_andnot_ps(xmm3, y);
2854 * y = _mm_or_ps(y,y2);
2855 */
2856 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2857 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2858 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2859 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2860 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2861 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2862
2863 /*
2864 * update the sign
2865 * y = _mm_xor_ps(y, sign_bit);
2866 */
2867 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2868 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2869
2870 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2871
2872 /* clamp output to be within [-1, 1] */
2873 y_result = lp_build_clamp(bld, y_result,
2874 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2875 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2876 /* If a is -inf, inf or NaN then return NaN */
2877 y_result = lp_build_select(bld, isfinite, y_result,
2878 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2879 return y_result;
2880 }
2881
2882
2883 /**
2884 * Generate sin(a)
2885 */
2886 LLVMValueRef
2887 lp_build_sin(struct lp_build_context *bld,
2888 LLVMValueRef a)
2889 {
2890 return lp_build_sin_or_cos(bld, a, FALSE);
2891 }
2892
2893
2894 /**
2895 * Generate cos(a)
2896 */
2897 LLVMValueRef
2898 lp_build_cos(struct lp_build_context *bld,
2899 LLVMValueRef a)
2900 {
2901 return lp_build_sin_or_cos(bld, a, TRUE);
2902 }
2903
2904
2905 /**
2906 * Generate pow(x, y)
2907 */
2908 LLVMValueRef
2909 lp_build_pow(struct lp_build_context *bld,
2910 LLVMValueRef x,
2911 LLVMValueRef y)
2912 {
2913 /* TODO: optimize the constant case */
2914 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2915 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2916 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2917 __FUNCTION__);
2918 }
2919
2920 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2921 }
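/*
 * Note: since this goes through log2, the result is only meaningful for
 * x > 0 (negative bases and 0 hit the undefined log2 cases).
 * E.g. pow(2.0, 10.0) = exp2(10.0 * log2(2.0)) = exp2(10.0) = 1024.0.
 */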
2922
2923
2924 /**
2925 * Generate exp(x)
2926 */
2927 LLVMValueRef
2928 lp_build_exp(struct lp_build_context *bld,
2929 LLVMValueRef x)
2930 {
2931 /* log2(e) = 1/log(2) */
2932 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2933 1.4426950408889634);
2934
2935 assert(lp_check_value(bld->type, x));
2936
2937 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2938 }
2939
2940
2941 /**
2942 * Generate log(x)
2943 * Behavior is undefined with infs, 0s and nans
2944 */
2945 LLVMValueRef
2946 lp_build_log(struct lp_build_context *bld,
2947 LLVMValueRef x)
2948 {
2949 /* log(2) */
2950 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2951 0.69314718055994529);
2952
2953 assert(lp_check_value(bld->type, x));
2954
2955 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2956 }
2957
2958 /**
2959 * Generate log(x) that handles edge cases (infs, 0s and nans)
2960 */
2961 LLVMValueRef
2962 lp_build_log_safe(struct lp_build_context *bld,
2963 LLVMValueRef x)
2964 {
2965 /* log(2) */
2966 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2967 0.69314718055994529);
2968
2969 assert(lp_check_value(bld->type, x));
2970
2971 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2972 }
2973
2974
2975 /**
2976 * Generate polynomial.
2977 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2978 */
2979 LLVMValueRef
2980 lp_build_polynomial(struct lp_build_context *bld,
2981 LLVMValueRef x,
2982 const double *coeffs,
2983 unsigned num_coeffs)
2984 {
2985 const struct lp_type type = bld->type;
2986 LLVMValueRef even = NULL, odd = NULL;
2987 LLVMValueRef x2;
2988 unsigned i;
2989
2990 assert(lp_check_value(bld->type, x));
2991
2992 /* TODO: optimize the constant case */
2993 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2994 LLVMIsConstant(x)) {
2995 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2996 __FUNCTION__);
2997 }
2998
2999 /*
3000     * Calculate odd and even terms separately to decrease data dependency
3001 * Ex:
3002 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3003 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3004 */
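/*
 * E.g. for num_coeffs == 4 the loop below builds
 *   even = c[0] + x2*c[2],  odd = c[1] + x2*c[3]
 * and the final result is odd*x + even
 *   = c[0] + c[1]*x + c[2]*x^2 + c[3]*x^3.
 */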
3005 x2 = lp_build_mul(bld, x, x);
3006
3007 for (i = num_coeffs; i--; ) {
3008 LLVMValueRef coeff;
3009
3010 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3011
3012 if (i % 2 == 0) {
3013 if (even)
3014 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
3015 else
3016 even = coeff;
3017 } else {
3018 if (odd)
3019 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
3020 else
3021 odd = coeff;
3022 }
3023 }
3024
3025 if (odd)
3026 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
3027 else if (even)
3028 return even;
3029 else
3030 return bld->undef;
3031 }
3032
3033
3034 /**
3035 * Minimax polynomial fit of 2**x, in range [0, 1[
3036 */
3037 const double lp_build_exp2_polynomial[] = {
3038 #if EXP_POLY_DEGREE == 5
3039 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3040 0.693153073200168932794,
3041 0.240153617044375388211,
3042 0.0558263180532956664775,
3043 0.00898934009049466391101,
3044 0.00187757667519147912699
3045 #elif EXP_POLY_DEGREE == 4
3046 1.00000259337069434683,
3047 0.693003834469974940458,
3048 0.24144275689150793076,
3049 0.0520114606103070150235,
3050 0.0135341679161270268764
3051 #elif EXP_POLY_DEGREE == 3
3052 0.999925218562710312959,
3053 0.695833540494823811697,
3054 0.226067155427249155588,
3055 0.0780245226406372992967
3056 #elif EXP_POLY_DEGREE == 2
3057 1.00172476321474503578,
3058 0.657636275736077639316,
3059 0.33718943461968720704
3060 #else
3061 #error
3062 #endif
3063 };
3064
3065
3066 LLVMValueRef
3067 lp_build_exp2(struct lp_build_context *bld,
3068 LLVMValueRef x)
3069 {
3070 LLVMBuilderRef builder = bld->gallivm->builder;
3071 const struct lp_type type = bld->type;
3072 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3073 LLVMValueRef ipart = NULL;
3074 LLVMValueRef fpart = NULL;
3075 LLVMValueRef expipart = NULL;
3076 LLVMValueRef expfpart = NULL;
3077 LLVMValueRef res = NULL;
3078
3079 assert(lp_check_value(bld->type, x));
3080
3081 /* TODO: optimize the constant case */
3082 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3083 LLVMIsConstant(x)) {
3084 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3085 __FUNCTION__);
3086 }
3087
3088 assert(type.floating && type.width == 32);
3089
3090    /* We want to preserve NaN and make sure that for exp2, if x > 128,
3091 * the result is INF and if it's smaller than -126.9 the result is 0 */
3092 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3093 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3094 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3095 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3096
3097 /* ipart = floor(x) */
3098 /* fpart = x - ipart */
3099 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3100
3101 /* expipart = (float) (1 << ipart) */
3102 expipart = LLVMBuildAdd(builder, ipart,
3103 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3104 expipart = LLVMBuildShl(builder, expipart,
3105 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3106 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
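/*
 * Illustrative trace of the bias trick above: for ipart = 3,
 * (3 + 127) << 23 = 0x41000000, which reinterpreted as a float is 8.0,
 * i.e. exactly 2^ipart.  The min/max clamp above keeps ipart in a range
 * where this stays well defined (ipart = 128 deliberately yields Inf).
 */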
3107
3108 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3109 Elements(lp_build_exp2_polynomial));
3110
3111 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3112
3113 return res;
3114 }
3115
3116
3117
3118 /**
3119  * Extract the exponent of an IEEE-754 floating point value.
3120 *
3121 * Optionally apply an integer bias.
3122 *
3123 * Result is an integer value with
3124 *
3125 * ifloor(log2(x)) + bias
3126 */
3127 LLVMValueRef
3128 lp_build_extract_exponent(struct lp_build_context *bld,
3129 LLVMValueRef x,
3130 int bias)
3131 {
3132 LLVMBuilderRef builder = bld->gallivm->builder;
3133 const struct lp_type type = bld->type;
3134 unsigned mantissa = lp_mantissa(type);
3135 LLVMValueRef res;
3136
3137 assert(type.floating);
3138
3139 assert(lp_check_value(bld->type, x));
3140
3141 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3142
3143 res = LLVMBuildLShr(builder, x,
3144 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3145 res = LLVMBuildAnd(builder, res,
3146 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3147 res = LLVMBuildSub(builder, res,
3148 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3149
3150 return res;
3151 }
3152
3153
3154 /**
3155  * Extract the mantissa of a floating point value.
3156 *
3157 * Result is a floating point value with
3158 *
3159  *   x / 2**floor(log2(x))
3160 */
3161 LLVMValueRef
3162 lp_build_extract_mantissa(struct lp_build_context *bld,
3163 LLVMValueRef x)
3164 {
3165 LLVMBuilderRef builder = bld->gallivm->builder;
3166 const struct lp_type type = bld->type;
3167 unsigned mantissa = lp_mantissa(type);
3168 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3169 (1ULL << mantissa) - 1);
3170 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3171 LLVMValueRef res;
3172
3173 assert(lp_check_value(bld->type, x));
3174
3175 assert(type.floating);
3176
3177 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3178
3179 /* res = x / 2**ipart */
3180 res = LLVMBuildAnd(builder, x, mantmask, "");
3181 res = LLVMBuildOr(builder, res, one, "");
3182 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3183
3184 return res;
3185 }
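/*
 * Worked example for the masking above: x = 6.0 has the bit pattern
 * 0x40c00000; keeping only the mantissa bits (0x400000) and OR'ing in the
 * bits of 1.0 (0x3f800000) gives 0x3fc00000 = 1.5 = 6.0 / 2^2, i.e. the
 * significand scaled into [1, 2).
 */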
3186
3187
3188
3189 /**
3190  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3191  * These coefficients can be generated with
3192 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3193 */
3194 const double lp_build_log2_polynomial[] = {
3195 #if LOG_POLY_DEGREE == 5
3196 2.88539008148777786488L,
3197 0.961796878841293367824L,
3198 0.577058946784739859012L,
3199 0.412914355135828735411L,
3200 0.308591899232910175289L,
3201 0.352376952300281371868L,
3202 #elif LOG_POLY_DEGREE == 4
3203 2.88539009343309178325L,
3204 0.961791550404184197881L,
3205 0.577440339438736392009L,
3206 0.403343858251329912514L,
3207 0.406718052498846252698L,
3208 #elif LOG_POLY_DEGREE == 3
3209 2.88538959748872753838L,
3210 0.961932915889597772928L,
3211 0.571118517972136195241L,
3212 0.493997535084709500285L,
3213 #else
3214 #error
3215 #endif
3216 };
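/*
 * Background for the fit above (standard identity, not specific to this
 * table): for m in [1, 2) and y = (m - 1)/(m + 1) (so y^2 < 1/9),
 *   log2(m) = (2/ln 2) * atanh(y) = (2/ln 2) * (y + y^3/3 + y^5/5 + ...)
 *           ~ y * P(y^2)
 * which is why the first coefficient is close to 2/ln 2 = 2.885390...
 * lp_build_log2_approx() below evaluates exactly this with z = y^2.
 */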
3217
3218 /**
3219 * See http://www.devmaster.net/forums/showthread.php?p=43580
3220 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3221 * http://www.nezumi.demon.co.uk/consult/logx.htm
3222 *
3223 * If handle_edge_cases is true the function will perform computations
3224 * to match the required D3D10+ behavior for each of the edge cases.
3225 * That means that if input is:
3226  * - less than zero (down to and including -inf), then NaN will be returned
3227 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3228 * - +infinity, then +infinity will be returned
3229 * - NaN, then NaN will be returned
3230 *
3231 * Those checks are fairly expensive so if you don't need them make sure
3232 * handle_edge_cases is false.
3233 */
3234 void
3235 lp_build_log2_approx(struct lp_build_context *bld,
3236 LLVMValueRef x,
3237 LLVMValueRef *p_exp,
3238 LLVMValueRef *p_floor_log2,
3239 LLVMValueRef *p_log2,
3240 boolean handle_edge_cases)
3241 {
3242 LLVMBuilderRef builder = bld->gallivm->builder;
3243 const struct lp_type type = bld->type;
3244 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3245 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3246
3247 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3248 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3249 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3250
3251 LLVMValueRef i = NULL;
3252 LLVMValueRef y = NULL;
3253 LLVMValueRef z = NULL;
3254 LLVMValueRef exp = NULL;
3255 LLVMValueRef mant = NULL;
3256 LLVMValueRef logexp = NULL;
3257 LLVMValueRef logmant = NULL;
3258 LLVMValueRef res = NULL;
3259
3260 assert(lp_check_value(bld->type, x));
3261
3262 if(p_exp || p_floor_log2 || p_log2) {
3263 /* TODO: optimize the constant case */
3264 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3265 LLVMIsConstant(x)) {
3266 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3267 __FUNCTION__);
3268 }
3269
3270 assert(type.floating && type.width == 32);
3271
3272 /*
3273 * We don't explicitly handle denormalized numbers. They will yield a
3274        * result in the neighbourhood of -127, which appears to be
3275        * adequate.
3276 */
3277
3278 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3279
3280 /* exp = (float) exponent(x) */
3281 exp = LLVMBuildAnd(builder, i, expmask, "");
3282 }
3283
3284 if(p_floor_log2 || p_log2) {
3285 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3286 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3287 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3288 }
3289
3290 if(p_log2) {
3291 /* mant = 1 + (float) mantissa(x) */
3292 mant = LLVMBuildAnd(builder, i, mantmask, "");
3293 mant = LLVMBuildOr(builder, mant, one, "");
3294 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3295
3296 /* y = (mant - 1) / (mant + 1) */
3297 y = lp_build_div(bld,
3298 lp_build_sub(bld, mant, bld->one),
3299 lp_build_add(bld, mant, bld->one)
3300 );
3301
3302 /* z = y^2 */
3303 z = lp_build_mul(bld, y, y);
3304
3305 /* compute P(z) */
3306 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3307 Elements(lp_build_log2_polynomial));
3308
3309 /* logmant = y * P(z) */
3310 logmant = lp_build_mul(bld, y, logmant);
3311
3312 res = lp_build_add(bld, logmant, logexp);
3313
3314 if (type.floating && handle_edge_cases) {
3315 LLVMValueRef negmask, infmask, zmask;
3316 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3317 lp_build_const_vec(bld->gallivm, type, 0.0f));
3318 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3319 lp_build_const_vec(bld->gallivm, type, 0.0f));
3320 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3321 lp_build_const_vec(bld->gallivm, type, INFINITY));
3322
3323          /* If x is equal to inf, make sure we return inf */
3324 res = lp_build_select(bld, infmask,
3325 lp_build_const_vec(bld->gallivm, type, INFINITY),
3326 res);
3327          /* If x is equal to 0, return -inf */
3328 res = lp_build_select(bld, zmask,
3329 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3330 res);
3331 /* If x is nan or less than 0, return nan */
3332 res = lp_build_select(bld, negmask,
3333 lp_build_const_vec(bld->gallivm, type, NAN),
3334 res);
3335 }
3336 }
3337
3338 if(p_exp) {
3339 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3340 *p_exp = exp;
3341 }
3342
3343 if(p_floor_log2)
3344 *p_floor_log2 = logexp;
3345
3346 if(p_log2)
3347 *p_log2 = res;
3348 }
3349
3350
3351 /*
3352 * log2 implementation which doesn't have special code to
3353 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3354 * the results for those cases are undefined.
3355 */
3356 LLVMValueRef
3357 lp_build_log2(struct lp_build_context *bld,
3358 LLVMValueRef x)
3359 {
3360 LLVMValueRef res;
3361 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3362 return res;
3363 }
3364
3365 /*
3366 * Version of log2 which handles all edge cases.
3367 * Look at documentation of lp_build_log2_approx for
3368 * description of the behavior for each of the edge cases.
3369 */
3370 LLVMValueRef
3371 lp_build_log2_safe(struct lp_build_context *bld,
3372 LLVMValueRef x)
3373 {
3374 LLVMValueRef res;
3375 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3376 return res;
3377 }
3378
3379
3380 /**
3381 * Faster (and less accurate) log2.
3382 *
3383 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3384 *
3385 * Piece-wise linear approximation, with exact results when x is a
3386 * power of two.
3387 *
3388 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3389 */
3390 LLVMValueRef
3391 lp_build_fast_log2(struct lp_build_context *bld,
3392 LLVMValueRef x)
3393 {
3394 LLVMBuilderRef builder = bld->gallivm->builder;
3395 LLVMValueRef ipart;
3396 LLVMValueRef fpart;
3397
3398 assert(lp_check_value(bld->type, x));
3399
3400 assert(bld->type.floating);
3401
3402 /* ipart = floor(log2(x)) - 1 */
3403 ipart = lp_build_extract_exponent(bld, x, -1);
3404 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3405
3406 /* fpart = x / 2**ipart */
3407 fpart = lp_build_extract_mantissa(bld, x);
3408
3409 /* ipart + fpart */
3410 return LLVMBuildFAdd(builder, ipart, fpart, "");
3411 }
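/*
 * Worked example for lp_build_fast_log2 above: x = 6.0 gives
 * ipart = floor(log2(6)) - 1 = 1 and fpart = 6/2^2 = 1.5, so the result
 * is 2.5 versus the exact log2(6) ~ 2.585; for x = 8.0 it is
 * (3 - 1) + 1.0 = 3.0, exact as promised for powers of two.
 */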
3412
3413
3414 /**
3415 * Fast implementation of iround(log2(x)).
3416 *
3417 * Not an approximation -- it should give accurate results all the time.
3418 */
3419 LLVMValueRef
3420 lp_build_ilog2(struct lp_build_context *bld,
3421 LLVMValueRef x)
3422 {
3423 LLVMBuilderRef builder = bld->gallivm->builder;
3424 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3425 LLVMValueRef ipart;
3426
3427 assert(bld->type.floating);
3428
3429 assert(lp_check_value(bld->type, x));
3430
3431 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3432 x = LLVMBuildFMul(builder, x, sqrt2, "");
3433
3434 /* ipart = floor(log2(x) + 0.5) */
3435 ipart = lp_build_extract_exponent(bld, x, 0);
3436
3437 return ipart;
3438 }
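/*
 * Worked example for lp_build_ilog2 above: x = 5.0 -> 5*sqrt(2) ~ 7.07,
 * whose exponent is 2 = round(log2(5)) = round(2.32); x = 6.0 ->
 * 6*sqrt(2) ~ 8.49, whose exponent is 3 = round(log2(6)) = round(2.58).
 * The sqrt(2) factor shifts the exponent boundaries onto the rounding
 * midpoints of log2(x).
 */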
3439
3440 LLVMValueRef
3441 lp_build_mod(struct lp_build_context *bld,
3442 LLVMValueRef x,
3443 LLVMValueRef y)
3444 {
3445 LLVMBuilderRef builder = bld->gallivm->builder;
3446 LLVMValueRef res;
3447 const struct lp_type type = bld->type;
3448
3449 assert(lp_check_value(type, x));
3450 assert(lp_check_value(type, y));
3451
3452 if (type.floating)
3453 res = LLVMBuildFRem(builder, x, y, "");
3454 else if (type.sign)
3455 res = LLVMBuildSRem(builder, x, y, "");
3456 else
3457 res = LLVMBuildURem(builder, x, y, "");
3458 return res;
3459 }
3460
3461
3462 /*
3463 * For floating inputs it creates and returns a mask
3464 * which is all 1's for channels which are NaN.
3465 * Channels inside x which are not NaN will be 0.
3466 */
3467 LLVMValueRef
3468 lp_build_isnan(struct lp_build_context *bld,
3469 LLVMValueRef x)
3470 {
3471 LLVMValueRef mask;
3472 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3473
3474 assert(bld->type.floating);
3475 assert(lp_check_value(bld->type, x));
3476
3477 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3478 "isnotnan");
3479 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3480 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3481 return mask;
3482 }
3483
3484 /* Returns all 1's for floating point numbers that are
3485  * finite, and all zeros for -inf,
3486  * +inf and NaNs. */
3487 LLVMValueRef
3488 lp_build_isfinite(struct lp_build_context *bld,
3489 LLVMValueRef x)
3490 {
3491 LLVMBuilderRef builder = bld->gallivm->builder;
3492 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3493 struct lp_type int_type = lp_int_type(bld->type);
3494 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3495 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3496 0x7f800000);
3497
3498 if (!bld->type.floating) {
3499 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3500 }
3501 assert(bld->type.floating);
3502 assert(lp_check_value(bld->type, x));
3503 assert(bld->type.width == 32);
3504
3505 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3506 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3507 intx, infornan32);
3508 }
3509
3510 /*
3511 * Returns true if the number is nan or inf and false otherwise.
3512 * The input has to be a floating point vector.
3513 */
3514 LLVMValueRef
3515 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3516 const struct lp_type type,
3517 LLVMValueRef x)
3518 {
3519 LLVMBuilderRef builder = gallivm->builder;
3520 struct lp_type int_type = lp_int_type(type);
3521 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3522 0x7f800000);
3523 LLVMValueRef ret;
3524
3525 assert(type.floating);
3526
3527 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3528 ret = LLVMBuildAnd(builder, ret, const0, "");
3529 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3530 ret, const0);
3531
3532 return ret;
3533 }
3534
3535
3536 LLVMValueRef
3537 lp_build_fpstate_get(struct gallivm_state *gallivm)
3538 {
3539 if (util_cpu_caps.has_sse) {
3540 LLVMBuilderRef builder = gallivm->builder;
3541 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3542 gallivm,
3543 LLVMInt32TypeInContext(gallivm->context),
3544 "mxcsr_ptr");
3545 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3546 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3547 lp_build_intrinsic(builder,
3548 "llvm.x86.sse.stmxcsr",
3549 LLVMVoidTypeInContext(gallivm->context),
3550 &mxcsr_ptr8, 1, 0);
3551 return mxcsr_ptr;
3552 }
3553 return 0;
3554 }
3555
3556 void
3557 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3558 boolean zero)
3559 {
3560 if (util_cpu_caps.has_sse) {
3561 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3562 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3563
3564 LLVMBuilderRef builder = gallivm->builder;
3565 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3566 LLVMValueRef mxcsr =
3567 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3568
3569 if (util_cpu_caps.has_daz) {
3570          /* Enable denormals-are-zero mode */
3571 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3572 }
3573 if (zero) {
3574 mxcsr = LLVMBuildOr(builder, mxcsr,
3575 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3576 } else {
3577 mxcsr = LLVMBuildAnd(builder, mxcsr,
3578 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3579 }
3580
3581 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3582 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3583 }
3584 }
3585
3586 void
3587 lp_build_fpstate_set(struct gallivm_state *gallivm,
3588 LLVMValueRef mxcsr_ptr)
3589 {
3590 if (util_cpu_caps.has_sse) {
3591 LLVMBuilderRef builder = gallivm->builder;
3592 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3593 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3594 lp_build_intrinsic(builder,
3595 "llvm.x86.sse.ldmxcsr",
3596 LLVMVoidTypeInContext(gallivm->context),
3597 &mxcsr_ptr, 1, 0);
3598 }
3599 }
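/*
 * Typical usage sketch for the fpstate helpers above (illustrative only):
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that benefits from DAZ/FTZ ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 *
 * i.e. save the current MXCSR, force denormals to zero around the
 * generated code, then restore the previous state.
 */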