[mesa.git] / src / compiler / nir / nir_lower_double_ops.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "nir.h"
#include "nir_builder.h"
#include "c99_math.h"

#include <float.h>

/*
 * Lowers some unsupported double operations, using only:
 *
 * - pack/unpackDouble2x32
 * - conversion to/from single-precision
 * - double add, mul, and fma
 * - conditional select
 * - 32-bit integer and floating point arithmetic
 */

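/* Layout of an IEEE-754 double, which the exponent helpers below rely on:
 *
 *   bit 63     : sign
 *   bits 52-62 : biased exponent (bias 1023)
 *   bits 0-51  : mantissa
 */
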
/* Creates a double with the exponent bits set to a given integer value */
static nir_ssa_def *
set_exponent(nir_builder *b, nir_ssa_def *src, nir_ssa_def *exp)
{
   /* Split into bits 0-31 and 32-63 */
   nir_ssa_def *lo = nir_unpack_64_2x32_split_x(b, src);
   nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);

   /* The exponent is bits 52-62, i.e. bits 20-30 of the high word, so insert
    * the new exponent value into those bits.
    */
   nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x7ff00000), exp, hi);
   /* recombine */
   return nir_pack_64_2x32_split(b, lo, new_hi);
}

static nir_ssa_def *
get_exponent(nir_builder *b, nir_ssa_def *src)
{
   /* get bits 32-63 */
   nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);

   /* extract bits 20-30 of the high word */
   return nir_ubitfield_extract(b, hi, nir_imm_int(b, 20), nir_imm_int(b, 11));
}

/* Return infinity with the sign of the given source which is +/-0 */

static nir_ssa_def *
get_signed_inf(nir_builder *b, nir_ssa_def *zero)
{
   nir_ssa_def *zero_hi = nir_unpack_64_2x32_split_y(b, zero);

   /* The bit pattern for infinity is 0x7ff0000000000000, where the sign bit
    * is the highest bit. Only the sign bit can be non-zero in the passed in
    * source. So we essentially need to OR the infinity and the zero, except
    * the low 32 bits are always 0 so we can construct the correct high 32
    * bits and then pack it together with zero low 32 bits.
    */
   nir_ssa_def *inf_hi = nir_ior(b, nir_imm_int(b, 0x7ff00000), zero_hi);
   return nir_pack_64_2x32_split(b, nir_imm_int(b, 0), inf_hi);
}

/*
 * Generates the correctly-signed infinity if the source was zero, and flushes
 * the result to 0 if the source was infinity or the calculated exponent was
 * too small to be representable.
 */

static nir_ssa_def *
fix_inv_result(nir_builder *b, nir_ssa_def *res, nir_ssa_def *src,
               nir_ssa_def *exp)
{
   /* If the exponent is too small or the original input was infinity/NaN,
    * force the result to 0 (flush denorms) to avoid the work of handling
    * denorms properly. Note that this doesn't preserve positive/negative
    * zeros, but GLSL doesn't require it.
    */
   res = nir_bcsel(b, nir_ior(b, nir_ige(b, nir_imm_int(b, 0), exp),
                              nir_feq(b, nir_fabs(b, src),
                                      nir_imm_double(b, INFINITY))),
                   nir_imm_double(b, 0.0f), res);

   /* If the original input was 0, generate the correctly-signed infinity */
   res = nir_bcsel(b, nir_fne(b, src, nir_imm_double(b, 0.0f)),
                   res, get_signed_inf(b, src));

   return res;
}

static nir_ssa_def *
lower_rcp(nir_builder *b, nir_ssa_def *src)
{
   /* normalize the input to avoid range issues */
   nir_ssa_def *src_norm = set_exponent(b, src, nir_imm_int(b, 1023));

   /* cast to float, do an rcp, and then cast back to get an approximate
    * result
    */
   nir_ssa_def *ra = nir_f2f64(b, nir_frcp(b, nir_f2f32(b, src_norm)));

   /* Fixup the exponent of the result - note that we check if this is too
    * small below.
    */
   nir_ssa_def *new_exp = nir_isub(b, get_exponent(b, ra),
                                   nir_isub(b, get_exponent(b, src),
                                            nir_imm_int(b, 1023)));

   ra = set_exponent(b, ra, new_exp);

   /* Do a few Newton-Raphson steps to improve precision.
    *
    * Each step doubles the precision, and we started off with around 24 bits,
    * so we only need to do 2 steps to get to full precision. The step is:
    *
    * x_new = x * (2 - x*src)
    *
    * But we can re-arrange this to improve precision by using another fused
    * multiply-add:
    *
    * x_new = x + x * (1 - x*src)
    *
    * See https://en.wikipedia.org/wiki/Division_algorithm for more details.
    */

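   /* Each nir_ffma below computes ra + (-ra) * (ra * src - 1), i.e. the fused
    * form ra + ra * (1 - ra * src) of the step described above.
    */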
   ra = nir_ffma(b, nir_fneg(b, ra), nir_ffma(b, ra, src, nir_imm_double(b, -1)), ra);
   ra = nir_ffma(b, nir_fneg(b, ra), nir_ffma(b, ra, src, nir_imm_double(b, -1)), ra);

   return fix_inv_result(b, ra, src, new_exp);
}

static nir_ssa_def *
lower_sqrt_rsq(nir_builder *b, nir_ssa_def *src, bool sqrt)
{
   /* We want to compute:
    *
    * 1/sqrt(m * 2^e)
    *
    * When the exponent is even, this is equivalent to:
    *
    * 1/sqrt(m) * 2^(-e/2)
    *
    * and when the exponent is odd, this is equivalent to:
    *
    * 1/sqrt(m * 2) * 2^(-(e - 1)/2)
    *
    * where the m * 2 is absorbed into the exponent. So we want the exponent
    * inside the square root to be 1 if e is odd and 0 if e is even, and we
    * want to subtract off e/2 from the final exponent, rounded to negative
    * infinity. We can do the former by first computing the unbiased exponent,
    * and then AND'ing it with 1 to get 0 or 1, and we can do the latter by
    * shifting right by 1.
    */

   nir_ssa_def *unbiased_exp = nir_isub(b, get_exponent(b, src),
                                        nir_imm_int(b, 1023));
   nir_ssa_def *even = nir_iand(b, unbiased_exp, nir_imm_int(b, 1));
   nir_ssa_def *half = nir_ishr(b, unbiased_exp, nir_imm_int(b, 1));

   nir_ssa_def *src_norm = set_exponent(b, src,
                                        nir_iadd(b, nir_imm_int(b, 1023),
                                                 even));
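   /* For normal inputs src_norm is now m * 2^(e & 1), i.e. in [1, 4), which is
    * comfortably within single-precision range for the initial estimate.
    */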

   nir_ssa_def *ra = nir_f2f64(b, nir_frsq(b, nir_f2f32(b, src_norm)));
   nir_ssa_def *new_exp = nir_isub(b, get_exponent(b, ra), half);
   ra = set_exponent(b, ra, new_exp);

   /*
    * The following implements an iterative algorithm that's very similar
    * between sqrt and rsqrt. We start with an iteration of Goldschmidt's
    * algorithm, which looks like:
    *
    * a = the source
    * y_0 = initial (single-precision) rsqrt estimate
    *
    * h_0 = .5 * y_0
    * g_0 = a * y_0
    * r_0 = .5 - h_0 * g_0
    * g_1 = g_0 * r_0 + g_0
    * h_1 = h_0 * r_0 + h_0
    *
    * Now g_1 ~= sqrt(a), and h_1 ~= 1/(2 * sqrt(a)). We could continue
    * applying another round of Goldschmidt, but since we would never refer
    * back to a (the original source), we would add too much rounding error.
    * So instead, we do one last round of Newton-Raphson, which has better
    * rounding characteristics, to get the final rounding correct. This is
    * split into two cases:
    *
    * 1. sqrt
    *
    * Normally, doing a round of Newton-Raphson for sqrt involves taking a
    * reciprocal of the original estimate, which is slow since it isn't
    * supported in HW. But we can take advantage of the fact that we already
    * computed a good estimate of 1/(2 * g_1) by rearranging it like so:
    *
    * g_2 = .5 * (g_1 + a / g_1)
    *     = g_1 + .5 * (a / g_1 - g_1)
    *     = g_1 + (.5 / g_1) * (a - g_1^2)
    *     = g_1 + h_1 * (a - g_1^2)
    *
    * The second term represents the error, and by splitting it out we can get
    * better precision by computing it as part of a fused multiply-add. Since
    * both Newton-Raphson and Goldschmidt approximately double the precision of
    * the result, these two steps should be enough.
    *
    * 2. rsqrt
    *
    * First off, note that the first round of the Goldschmidt algorithm is
    * really just a Newton-Raphson step in disguise:
    *
    * h_1 = h_0 * (.5 - h_0 * g_0) + h_0
    *     = h_0 * (1.5 - h_0 * g_0)
    *     = h_0 * (1.5 - .5 * a * y_0^2)
    *     = (.5 * y_0) * (1.5 - .5 * a * y_0^2)
    *
    * which is the standard formula multiplied by .5. Unlike in the sqrt case,
    * we don't need the inverse to do a Newton-Raphson step; we just need h_1,
    * so we can skip the calculation of g_1. Instead, we simply do another
    * Newton-Raphson step:
    *
    * y_1 = 2 * h_1
    * r_1 = .5 - h_1 * y_1 * a
    * y_2 = y_1 * r_1 + y_1
    *
    * Where the difference from Goldschmidt is that we calculate y_1 * a
    * instead of using g_1. Doing it this way should be as fast as computing
    * y_1 up front instead of h_1, and it lets us share the code for the
    * initial Goldschmidt step with the sqrt case.
    *
    * Putting it together, the computations are:
    *
    * h_0 = .5 * y_0
    * g_0 = a * y_0
    * r_0 = .5 - h_0 * g_0
    * h_1 = h_0 * r_0 + h_0
    * if sqrt:
    *    g_1 = g_0 * r_0 + g_0
    *    r_1 = a - g_1 * g_1
    *    g_2 = h_1 * r_1 + g_1
    * else:
    *    y_1 = 2 * h_1
    *    r_1 = .5 - y_1 * (h_1 * a)
    *    y_2 = y_1 * r_1 + y_1
    *
    * For more on the ideas behind this, see "Software Division and Square
    * Root Using Goldschmidt's Algorithms" by Markstein and the Wikipedia page
    * on square roots
    * (https://en.wikipedia.org/wiki/Methods_of_computing_square_roots).
    */

   nir_ssa_def *one_half = nir_imm_double(b, 0.5);
   nir_ssa_def *h_0 = nir_fmul(b, one_half, ra);
   nir_ssa_def *g_0 = nir_fmul(b, src, ra);
   nir_ssa_def *r_0 = nir_ffma(b, nir_fneg(b, h_0), g_0, one_half);
   nir_ssa_def *h_1 = nir_ffma(b, h_0, r_0, h_0);
   nir_ssa_def *res;
   if (sqrt) {
      nir_ssa_def *g_1 = nir_ffma(b, g_0, r_0, g_0);
      nir_ssa_def *r_1 = nir_ffma(b, nir_fneg(b, g_1), g_1, src);
      res = nir_ffma(b, h_1, r_1, g_1);
   } else {
      nir_ssa_def *y_1 = nir_fmul(b, nir_imm_double(b, 2.0), h_1);
      nir_ssa_def *r_1 = nir_ffma(b, nir_fneg(b, y_1), nir_fmul(b, h_1, src),
                                  one_half);
      res = nir_ffma(b, y_1, r_1, y_1);
   }

   if (sqrt) {
      /* Here, the special cases we need to handle are
       * 0 -> 0 and
       * +inf -> +inf
       */
      const bool preserve_denorms =
         b->shader->info.float_controls_execution_mode &
         FLOAT_CONTROLS_DENORM_PRESERVE_FP64;
      nir_ssa_def *src_flushed = src;
      if (!preserve_denorms) {
         src_flushed = nir_bcsel(b,
                                 nir_flt(b, nir_fabs(b, src),
                                         nir_imm_double(b, DBL_MIN)),
                                 nir_imm_double(b, 0.0),
                                 src);
      }
      res = nir_bcsel(b, nir_ior(b, nir_feq(b, src_flushed, nir_imm_double(b, 0.0)),
                                 nir_feq(b, src, nir_imm_double(b, INFINITY))),
                      src_flushed, res);
   } else {
      res = fix_inv_result(b, res, src, new_exp);
   }

   return res;
}

static nir_ssa_def *
lower_trunc(nir_builder *b, nir_ssa_def *src)
{
   nir_ssa_def *unbiased_exp = nir_isub(b, get_exponent(b, src),
                                        nir_imm_int(b, 1023));

   nir_ssa_def *frac_bits = nir_isub(b, nir_imm_int(b, 52), unbiased_exp);

   /*
    * Decide the operation to apply depending on the unbiased exponent:
    *
    * if (unbiased_exp < 0)
    *    return 0
    * else if (unbiased_exp > 52)
    *    return src
    * else
    *    return src & (~0 << frac_bits)
    *
    * Notice that the else branch is a 64-bit integer operation that we need
    * to implement in terms of 32-bit integer arithmetics (at least until we
    * support 64-bit integer arithmetics).
    */

   /* Compute "~0 << frac_bits" in terms of hi/lo 32-bit integer math */
   nir_ssa_def *mask_lo =
      nir_bcsel(b,
                nir_ige(b, frac_bits, nir_imm_int(b, 32)),
                nir_imm_int(b, 0),
                nir_ishl(b, nir_imm_int(b, ~0), frac_bits));

   nir_ssa_def *mask_hi =
      nir_bcsel(b,
                nir_ilt(b, frac_bits, nir_imm_int(b, 33)),
                nir_imm_int(b, ~0),
                nir_ishl(b,
                         nir_imm_int(b, ~0),
                         nir_isub(b, frac_bits, nir_imm_int(b, 32))));
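
   /* For example, with unbiased_exp = 2 (src in [4, 8)) frac_bits is 50,
    * giving mask_lo = 0 and mask_hi = ~0 << 18 = 0xfffc0000, which keeps the
    * sign, the exponent and the two mantissa bits that are still integral.
    */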

   nir_ssa_def *src_lo = nir_unpack_64_2x32_split_x(b, src);
   nir_ssa_def *src_hi = nir_unpack_64_2x32_split_y(b, src);

   return
      nir_bcsel(b,
                nir_ilt(b, unbiased_exp, nir_imm_int(b, 0)),
                nir_imm_double(b, 0.0),
                nir_bcsel(b, nir_ige(b, unbiased_exp, nir_imm_int(b, 53)),
                          src,
                          nir_pack_64_2x32_split(b,
                                                 nir_iand(b, mask_lo, src_lo),
                                                 nir_iand(b, mask_hi, src_hi))));
}

static nir_ssa_def *
lower_floor(nir_builder *b, nir_ssa_def *src)
{
   /*
    * For x >= 0, floor(x) = trunc(x)
    * For x < 0,
    *    - if x is integer, floor(x) = x
    *    - otherwise, floor(x) = trunc(x) - 1
    */
   nir_ssa_def *tr = nir_ftrunc(b, src);
   nir_ssa_def *positive = nir_fge(b, src, nir_imm_double(b, 0.0));
   return nir_bcsel(b,
                    nir_ior(b, positive, nir_feq(b, src, tr)),
                    tr,
                    nir_fsub(b, tr, nir_imm_double(b, 1.0)));
}

static nir_ssa_def *
lower_ceil(nir_builder *b, nir_ssa_def *src)
{
   /* if x < 0, ceil(x) = trunc(x)
    * else if (x - trunc(x) == 0), ceil(x) = x
    * else, ceil(x) = trunc(x) + 1
    */
   nir_ssa_def *tr = nir_ftrunc(b, src);
   nir_ssa_def *negative = nir_flt(b, src, nir_imm_double(b, 0.0));
   return nir_bcsel(b,
                    nir_ior(b, negative, nir_feq(b, src, tr)),
                    tr,
                    nir_fadd(b, tr, nir_imm_double(b, 1.0)));
}

static nir_ssa_def *
lower_fract(nir_builder *b, nir_ssa_def *src)
{
   return nir_fsub(b, src, nir_ffloor(b, src));
}

static nir_ssa_def *
lower_round_even(nir_builder *b, nir_ssa_def *src)
{
   /* Add and subtract 2**52 to round off any fractional bits. */
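   /* 2**52 is the smallest double whose ulp is 1.0, so for |src| < 2**52 the
    * addition pushes any fractional bits out of the mantissa and the add is
    * rounded to an integer (ties to even); subtracting 2**52 afterwards
    * recovers that integer.
    */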
   nir_ssa_def *two52 = nir_imm_double(b, (double)(1ull << 52));
   nir_ssa_def *sign = nir_iand(b, nir_unpack_64_2x32_split_y(b, src),
                                nir_imm_int(b, 1ull << 31));

   b->exact = true;
   nir_ssa_def *res = nir_fsub(b, nir_fadd(b, nir_fabs(b, src), two52), two52);
   b->exact = false;

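   /* Sources with a magnitude of 2**52 or more (and NaN/Inf) have no
    * fractional bits, so they are returned unchanged; otherwise use the
    * rounded value with the original sign bit restored, so that e.g. -0.25
    * rounds to -0.0 rather than +0.0.
    */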
   return nir_bcsel(b, nir_flt(b, nir_fabs(b, src), two52),
                    nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, res),
                                           nir_ior(b, nir_unpack_64_2x32_split_y(b, res), sign)), src);
}

static nir_ssa_def *
lower_mod(nir_builder *b, nir_ssa_def *src0, nir_ssa_def *src1)
{
   /* mod(x, y) = x - y * floor(x/y)
    *
    * If the division is lowered, it could add rounding errors that make
    * floor() return the quotient minus one when x = N * y. If that happens,
    * we return zero instead, because mod(x, y)'s output is in [0, y).
    */
   nir_ssa_def *floor = nir_ffloor(b, nir_fdiv(b, src0, src1));
   nir_ssa_def *mod = nir_fsub(b, src0, nir_fmul(b, src1, floor));

   return nir_bcsel(b,
                    nir_fne(b, mod, src1),
                    mod,
                    nir_imm_double(b, 0.0));
}


static nir_ssa_def *
lower_doubles_instr_to_soft(nir_builder *b, nir_alu_instr *instr,
                            const nir_shader *softfp64,
                            nir_lower_doubles_options options)
{
   if (!(options & nir_lower_fp64_full_software))
      return NULL;

   assert(instr->dest.dest.is_ssa);

   const char *name;
   const struct glsl_type *return_type = glsl_uint64_t_type();

   switch (instr->op) {
   case nir_op_f2i64:
      if (instr->src[0].src.ssa->bit_size == 64)
         name = "__fp64_to_int64";
      else
         name = "__fp32_to_int64";
      return_type = glsl_int64_t_type();
      break;
   case nir_op_f2u64:
      if (instr->src[0].src.ssa->bit_size == 64)
         name = "__fp64_to_uint64";
      else
         name = "__fp32_to_uint64";
      break;
   case nir_op_f2f64:
      name = "__fp32_to_fp64";
      break;
   case nir_op_f2f32:
      name = "__fp64_to_fp32";
      return_type = glsl_float_type();
      break;
   case nir_op_f2i32:
      name = "__fp64_to_int";
      return_type = glsl_int_type();
      break;
   case nir_op_f2u32:
      name = "__fp64_to_uint";
      return_type = glsl_uint_type();
      break;
   case nir_op_f2b1:
   case nir_op_f2b32:
      name = "__fp64_to_bool";
      return_type = glsl_bool_type();
      break;
   case nir_op_b2f64:
      name = "__bool_to_fp64";
      break;
   case nir_op_i2f32:
      if (instr->src[0].src.ssa->bit_size != 64)
         return NULL;
      name = "__int64_to_fp32";
      return_type = glsl_float_type();
      break;
   case nir_op_u2f32:
      if (instr->src[0].src.ssa->bit_size != 64)
         return NULL;
      name = "__uint64_to_fp32";
      return_type = glsl_float_type();
      break;
   case nir_op_i2f64:
      if (instr->src[0].src.ssa->bit_size == 64)
         name = "__int64_to_fp64";
      else
         name = "__int_to_fp64";
      break;
   case nir_op_u2f64:
      if (instr->src[0].src.ssa->bit_size == 64)
         name = "__uint64_to_fp64";
      else
         name = "__uint_to_fp64";
      break;
   case nir_op_fabs:
      name = "__fabs64";
      break;
   case nir_op_fneg:
      name = "__fneg64";
      break;
   case nir_op_fround_even:
      name = "__fround64";
      break;
   case nir_op_ftrunc:
      name = "__ftrunc64";
      break;
   case nir_op_ffloor:
      name = "__ffloor64";
      break;
   case nir_op_ffract:
      name = "__ffract64";
      break;
   case nir_op_fsign:
      name = "__fsign64";
      break;
   case nir_op_feq:
      name = "__feq64";
      return_type = glsl_bool_type();
      break;
   case nir_op_fne:
      name = "__fne64";
      return_type = glsl_bool_type();
      break;
   case nir_op_flt:
      name = "__flt64";
      return_type = glsl_bool_type();
      break;
   case nir_op_fge:
      name = "__fge64";
      return_type = glsl_bool_type();
      break;
   case nir_op_fmin:
      name = "__fmin64";
      break;
   case nir_op_fmax:
      name = "__fmax64";
      break;
   case nir_op_fadd:
      name = "__fadd64";
      break;
   case nir_op_fmul:
      name = "__fmul64";
      break;
   case nir_op_ffma:
      name = "__ffma64";
      break;
   case nir_op_fsat:
      name = "__fsat64";
      break;
   default:
      return NULL;
   }

   nir_function *func = NULL;
   nir_foreach_function(function, softfp64) {
      if (strcmp(function->name, name) == 0) {
         func = function;
         break;
      }
   }
   if (!func || !func->impl) {
      fprintf(stderr, "Cannot find function \"%s\"\n", name);
      assert(func);
   }

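   /* Every softfp64 helper takes a dereference of a temporary that receives
    * the return value as its first parameter, followed by the operation's
    * source values.
    */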
   nir_ssa_def *params[4] = { NULL, };

   nir_variable *ret_tmp =
      nir_local_variable_create(b->impl, return_type, "return_tmp");
   nir_deref_instr *ret_deref = nir_build_deref_var(b, ret_tmp);
   params[0] = &ret_deref->dest.ssa;

   assert(nir_op_infos[instr->op].num_inputs + 1 == func->num_params);
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(i + 1 < ARRAY_SIZE(params));
      params[i + 1] = nir_mov_alu(b, instr->src[i], 1);
   }

   nir_inline_function_impl(b, func->impl, params);

   return nir_load_deref(b, ret_deref);
}

nir_lower_doubles_options
nir_lower_doubles_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_frcp: return nir_lower_drcp;
   case nir_op_fsqrt: return nir_lower_dsqrt;
   case nir_op_frsq: return nir_lower_drsq;
   case nir_op_ftrunc: return nir_lower_dtrunc;
   case nir_op_ffloor: return nir_lower_dfloor;
   case nir_op_fceil: return nir_lower_dceil;
   case nir_op_ffract: return nir_lower_dfract;
   case nir_op_fround_even: return nir_lower_dround_even;
   case nir_op_fmod: return nir_lower_dmod;
   case nir_op_fsub: return nir_lower_dsub;
   case nir_op_fdiv: return nir_lower_ddiv;
   default: return 0;
   }
}

struct lower_doubles_data {
   const nir_shader *softfp64;
   nir_lower_doubles_options options;
};

static bool
should_lower_double_instr(const nir_instr *instr, const void *_data)
{
   const struct lower_doubles_data *data = _data;
   const nir_lower_doubles_options options = data->options;

   if (instr->type != nir_instr_type_alu)
      return false;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   assert(alu->dest.dest.is_ssa);
   bool is_64 = alu->dest.dest.ssa.bit_size == 64;

   unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
   for (unsigned i = 0; i < num_srcs; i++) {
      is_64 |= (nir_src_bit_size(alu->src[i].src) == 64);
   }

   if (!is_64)
      return false;

   if (options & nir_lower_fp64_full_software)
      return true;

   return options & nir_lower_doubles_op_to_options_mask(alu->op);
}

static nir_ssa_def *
lower_doubles_instr(nir_builder *b, nir_instr *instr, void *_data)
{
   const struct lower_doubles_data *data = _data;
   const nir_lower_doubles_options options = data->options;
   nir_alu_instr *alu = nir_instr_as_alu(instr);

   nir_ssa_def *soft_def =
      lower_doubles_instr_to_soft(b, alu, data->softfp64, options);
   if (soft_def)
      return soft_def;

   if (!(options & nir_lower_doubles_op_to_options_mask(alu->op)))
      return NULL;

   nir_ssa_def *src = nir_mov_alu(b, alu->src[0],
                                  alu->dest.dest.ssa.num_components);

   switch (alu->op) {
   case nir_op_frcp:
      return lower_rcp(b, src);
   case nir_op_fsqrt:
      return lower_sqrt_rsq(b, src, true);
   case nir_op_frsq:
      return lower_sqrt_rsq(b, src, false);
   case nir_op_ftrunc:
      return lower_trunc(b, src);
   case nir_op_ffloor:
      return lower_floor(b, src);
   case nir_op_fceil:
      return lower_ceil(b, src);
   case nir_op_ffract:
      return lower_fract(b, src);
   case nir_op_fround_even:
      return lower_round_even(b, src);

   case nir_op_fdiv:
   case nir_op_fsub:
   case nir_op_fmod: {
      nir_ssa_def *src1 = nir_mov_alu(b, alu->src[1],
                                      alu->dest.dest.ssa.num_components);
      switch (alu->op) {
      case nir_op_fdiv:
         return nir_fmul(b, src, nir_frcp(b, src1));
      case nir_op_fsub:
         return nir_fadd(b, src, nir_fneg(b, src1));
      case nir_op_fmod:
         return lower_mod(b, src, src1);
      default:
         unreachable("unhandled opcode");
      }
   }
   default:
      unreachable("unhandled opcode");
   }
}

static bool
nir_lower_doubles_impl(nir_function_impl *impl,
                       const nir_shader *softfp64,
                       nir_lower_doubles_options options)
{
   struct lower_doubles_data data = {
      .softfp64 = softfp64,
      .options = options,
   };

   bool progress =
      nir_function_impl_lower_instructions(impl,
                                           should_lower_double_instr,
                                           lower_doubles_instr,
                                           &data);

   if (progress && (options & nir_lower_fp64_full_software)) {
      /* SSA and register indices are completely messed up now */
      nir_index_ssa_defs(impl);
      nir_index_local_regs(impl);

      nir_metadata_preserve(impl, nir_metadata_none);

      /* And we have deref casts we need to clean up thanks to function
       * inlining.
       */
      nir_opt_deref_impl(impl);
   }

   return progress;
}

bool
nir_lower_doubles(nir_shader *shader,
                  const nir_shader *softfp64,
                  nir_lower_doubles_options options)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl) {
         progress |= nir_lower_doubles_impl(function->impl, softfp64, options);
      }
   }

   return progress;
}