src/compiler/glsl/lower_instructions.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 /**
  25  * \file lower_instructions.cpp
  26  *
  27  * Many GPUs lack native instructions for certain expression operations, and
  28  * must replace them with some other expression tree.  This pass lowers some
  29  * of the most common cases, allowing the lowering code to be implemented once
  30  * rather than in each driver backend.
  31  *
  32  * Currently supported transformations:
  33  * - SUB_TO_ADD_NEG
  34  * - DIV_TO_MUL_RCP
  35  * - INT_DIV_TO_MUL_RCP
  36  * - EXP_TO_EXP2
  37  * - POW_TO_EXP2
  38  * - LOG_TO_LOG2
  39  * - MOD_TO_FLOOR
  40  * - LDEXP_TO_ARITH
   41  * DFREXP_DLDEXP_TO_ARITH
  42  * - CARRY_TO_ARITH
  43  * - BORROW_TO_ARITH
  44  * - SAT_TO_CLAMP
  45  * - DOPS_TO_DFRAC
  46  *
  47  * SUB_TO_ADD_NEG:
  48  * ---------------
  49  * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
  50  *
  51  * This simplifies expression reassociation, and for many backends
  52  * there is no subtract operation separate from adding the negation.
  53  * For backends with native subtract operations, they will probably
  54  * want to recognize add(op0, neg(op1)) or the other way around to
  55  * produce a subtract anyway.
  56  *
  57  * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
  58  * ---------------------------------------------------------
  59  * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
  60  *
  61  * Many GPUs don't have a divide instruction (945 and 965 included),
  62  * but they do have an RCP instruction to compute an approximate
  63  * reciprocal.  By breaking the operation down, constant reciprocals
  64  * can get constant folded.
  65  *
  66  * FDIV_TO_MUL_RCP only lowers single-precision floating point division;
  67  * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
  68  * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
  69  * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating
  70  * point so that RCP is possible.
  71  *
  72  * EXP_TO_EXP2 and LOG_TO_LOG2:
  73  * ----------------------------
  74  * Many GPUs don't have a base e log or exponent instruction, but they
  75  * do have base 2 versions, so this pass converts exp and log to exp2
  76  * and log2 operations.
  77  *
  78  * POW_TO_EXP2:
  79  * -----------
  80  * Many older GPUs don't have an x**y instruction.  For these GPUs, convert
  81  * x**y to 2**(y * log2(x)).
  82  *
  83  * MOD_TO_FLOOR:
  84  * -------------
  85  * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
  86  *
  87  * Many GPUs don't have a MOD instruction (945 and 965 included), and
  88  * if we have to break it down like this anyway, it gives an
  89  * opportunity to do things like constant fold the (1.0 / op1) easily.
  90  *
   91  * Note: before we used to implement this as op1 * fract(op0 / op1) but this
  92  * implementation had significant precision errors.
  93  *
  94  * LDEXP_TO_ARITH:
  95  * -------------
  96  * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
  97  *
  98  * DFREXP_DLDEXP_TO_ARITH:
  99  * ---------------
 100  * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
 101  * arithmetic and bit ops for double arguments.
 102  *
 103  * CARRY_TO_ARITH:
 104  * ---------------
 105  * Converts ir_carry into (x + y) < x.
 106  *
 107  * BORROW_TO_ARITH:
 108  * ----------------
 109  * Converts ir_borrow into (x < y).
 110  *
 111  * SAT_TO_CLAMP:
 112  * -------------
 113  * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
 114  *
 115  * DOPS_TO_DFRAC:
 116  * --------------
 117  * Converts double trunc, ceil, floor, round to fract
 118  */
 119
 120 #include "c99_math.h"
 121 #include "program/prog_instruction.h" /* for swizzle */
 122 #include "compiler/glsl_types.h"
 123 #include "ir.h"
 124 #include "ir_builder.h"
 125 #include "ir_optimization.h"
 126
 127 using namespace ir_builder;
 128
 129 namespace {
 130
 131 class lower_instructions_visitor : public ir_hierarchical_visitor {
 132 public:
 133    lower_instructions_visitor(unsigned lower)
 134       : progress(false), lower(lower) { }
 135
 136    ir_visitor_status visit_leave(ir_expression *);
 137
 138    bool progress;
 139
 140 private:
 141    unsigned lower; /** Bitfield of which operations to lower */
 142
 143    void sub_to_add_neg(ir_expression *);
 144    void div_to_mul_rcp(ir_expression *);
 145    void int_div_to_mul_rcp(ir_expression *);
 146    void mod_to_floor(ir_expression *);
 147    void exp_to_exp2(ir_expression *);
 148    void pow_to_exp2(ir_expression *);
 149    void log_to_log2(ir_expression *);
 150    void ldexp_to_arith(ir_expression *);
 151    void dldexp_to_arith(ir_expression *);
 152    void dfrexp_sig_to_arith(ir_expression *);
 153    void dfrexp_exp_to_arith(ir_expression *);
 154    void carry_to_arith(ir_expression *);
 155    void borrow_to_arith(ir_expression *);
 156    void sat_to_clamp(ir_expression *);
 157    void double_dot_to_fma(ir_expression *);
 158    void double_lrp(ir_expression *);
 159    void dceil_to_dfrac(ir_expression *);
 160    void dfloor_to_dfrac(ir_expression *);
 161    void dround_even_to_dfrac(ir_expression *);
 162    void dtrunc_to_dfrac(ir_expression *);
 163    void dsign_to_csel(ir_expression *);
 164    void bit_count_to_math(ir_expression *);
 165    void extract_to_shifts(ir_expression *);
 166    void insert_to_shifts(ir_expression *);
 167    void reverse_to_shifts(ir_expression *ir);
 168    void find_lsb_to_float_cast(ir_expression *ir);
 169    void find_msb_to_float_cast(ir_expression *ir);
 170    void imul_high_to_mul(ir_expression *ir);
 171    void sqrt_to_abs_sqrt(ir_expression *ir);
 172
 173    ir_expression *_carry(operand a, operand b);
 174 };
 175
 176 } /* anonymous namespace */
 177
 178 /**
 179  * Determine if a particular type of lowering should occur
 180  */
 181 #define lowering(x) (this->lower & x)
 182
 183 bool
 184 lower_instructions(exec_list *instructions, unsigned what_to_lower)
 185 {
 186    lower_instructions_visitor v(what_to_lower);
 187
 188    visit_list_elements(&v, instructions);
 189    return v.progress;
 190 }
 191
 192 void
 193 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
 194 {
 195    ir->operation = ir_binop_add;
 196    ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
 197                                            ir->operands[1], NULL);
 198    this->progress = true;
 199 }
 200
 201 void
 202 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
 203 {
 204    assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
 205
 206    /* New expression for the 1.0 / op1 */
 207    ir_rvalue *expr;
 208    expr = new(ir) ir_expression(ir_unop_rcp,
 209                                 ir->operands[1]->type,
 210                                 ir->operands[1]);
 211
 212    /* op0 / op1 -> op0 * (1.0 / op1) */
 213    ir->operation = ir_binop_mul;
 214    ir->operands[1] = expr;
 215
 216    this->progress = true;
 217 }
 218
 219 void
 220 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
 221 {
 222    assert(ir->operands[1]->type->is_integer());
 223
 224    /* Be careful with integer division -- we need to do it as a
 225     * float and re-truncate, since rcp(n > 1) of an integer would
 226     * just be 0.
 227     */
 228    ir_rvalue *op0, *op1;
 229    const struct glsl_type *vec_type;
 230
 231    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 232                                       ir->operands[1]->type->vector_elements,
 233                                       ir->operands[1]->type->matrix_columns);
 234
 235    if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
 236       op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
 237    else
 238       op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
 239
 240    op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
 241
 242    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 243                                       ir->operands[0]->type->vector_elements,
 244                                       ir->operands[0]->type->matrix_columns);
 245
 246    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
 247       op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
 248    else
 249       op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
 250
 251    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 252                                       ir->type->vector_elements,
 253                                       ir->type->matrix_columns);
 254
 255    op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
 256
 257    if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
 258       ir->operation = ir_unop_f2i;
 259       ir->operands[0] = op0;
 260    } else {
 261       ir->operation = ir_unop_i2u;
 262       ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
 263    }
 264    ir->operands[1] = NULL;
 265
 266    this->progress = true;
 267 }
 268
 269 void
 270 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
 271 {
 272    ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
 273
 274    ir->operation = ir_unop_exp2;
 275    ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
 276                                            ir->operands[0], log2_e);
 277    this->progress = true;
 278 }
 279
 280 void
 281 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
 282 {
 283    ir_expression *const log2_x =
 284       new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
 285                             ir->operands[0]);
 286
 287    ir->operation = ir_unop_exp2;
 288    ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
 289                                            ir->operands[1], log2_x);
 290    ir->operands[1] = NULL;
 291    this->progress = true;
 292 }
 293
 294 void
 295 lower_instructions_visitor::log_to_log2(ir_expression *ir)
 296 {
 297    ir->operation = ir_binop_mul;
 298    ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
 299                                            ir->operands[0], NULL);
 300    ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
 301    this->progress = true;
 302 }
 303
 304 void
 305 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
 306 {
 307    ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
 308                                          ir_var_temporary);
 309    ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
 310                                          ir_var_temporary);
 311    this->base_ir->insert_before(x);
 312    this->base_ir->insert_before(y);
 313
 314    ir_assignment *const assign_x =
 315       new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
 316                             ir->operands[0], NULL);
 317    ir_assignment *const assign_y =
 318       new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
 319                             ir->operands[1], NULL);
 320
 321    this->base_ir->insert_before(assign_x);
 322    this->base_ir->insert_before(assign_y);
 323
 324    ir_expression *const div_expr =
 325       new(ir) ir_expression(ir_binop_div, x->type,
 326                             new(ir) ir_dereference_variable(x),
 327                             new(ir) ir_dereference_variable(y));
 328
 329    /* Don't generate new IR that would need to be lowered in an additional
 330     * pass.
 331     */
 332    if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) ||
 333        (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double()))
 334       div_to_mul_rcp(div_expr);
 335
 336    ir_expression *const floor_expr =
 337       new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
 338
 339    if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
 340       dfloor_to_dfrac(floor_expr);
 341
 342    ir_expression *const mul_expr =
 343       new(ir) ir_expression(ir_binop_mul,
 344                             new(ir) ir_dereference_variable(y),
 345                             floor_expr);
 346
 347    ir->operation = ir_binop_sub;
 348    ir->operands[0] = new(ir) ir_dereference_variable(x);
 349    ir->operands[1] = mul_expr;
 350    this->progress = true;
 351 }
 352
 353 void
 354 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
 355 {
 356    /* Translates
 357     *    ir_binop_ldexp x exp
 358     * into
 359     *
 360     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
 361     *    resulting_biased_exp = extracted_biased_exp + exp;
 362     *
 363     *    if (resulting_biased_exp < 1 || x == 0.0f) {
 364     *       return copysign(0.0, x);
 365     *    }
 366     *
 367     *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
 368     *                       lshift(i2u(resulting_biased_exp), exp_shift));
 369     *
 370     * which we can't actually implement as such, since the GLSL IR doesn't
 371     * have vectorized if-statements. We actually implement it without branches
 372     * using conditional-select:
 373     *
 374     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
 375     *    resulting_biased_exp = extracted_biased_exp + exp;
 376     *
 377     *    is_not_zero_or_underflow = logic_and(nequal(x, 0.0f),
 378     *                                         gequal(resulting_biased_exp, 1);
 379     *    x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
 380     *    resulting_biased_exp = csel(is_not_zero_or_underflow,
 381     *                                resulting_biased_exp, 0);
 382     *
 383     *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
 384     *                       lshift(i2u(resulting_biased_exp), exp_shift));
 385     */
 386
 387    const unsigned vec_elem = ir->type->vector_elements;
 388
 389    /* Types */
 390    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
 391    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 392
 393    /* Constants */
 394    ir_constant *zeroi = ir_constant::zero(ir, ivec);
 395
 396    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
 397
 398    ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
 399
 400    /* Temporary variables */
 401    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
 402    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
 403
 404    ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
 405                                                   ir_var_temporary);
 406
 407    ir_variable *extracted_biased_exp =
 408       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
 409    ir_variable *resulting_biased_exp =
 410       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
 411
 412    ir_variable *is_not_zero_or_underflow =
 413       new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
 414
 415    ir_instruction &i = *base_ir;
 416
 417    /* Copy <x> and <exp> arguments. */
 418    i.insert_before(x);
 419    i.insert_before(assign(x, ir->operands[0]));
 420    i.insert_before(exp);
 421    i.insert_before(assign(exp, ir->operands[1]));
 422
 423    /* Extract the biased exponent from <x>. */
 424    i.insert_before(extracted_biased_exp);
 425    i.insert_before(assign(extracted_biased_exp,
 426                           rshift(bitcast_f2i(abs(x)), exp_shift)));
 427
 428    i.insert_before(resulting_biased_exp);
 429    i.insert_before(assign(resulting_biased_exp,
 430                           add(extracted_biased_exp, exp)));
 431
 432    /* Test if result is ±0.0, subnormal, or underflow by checking if the
 433     * resulting biased exponent would be less than 0x1. If so, the result is
 434     * 0.0 with the sign of x. (Actually, invert the conditions so that
 435     * immediate values are the second arguments, which is better for i965)
 436     */
 437    i.insert_before(zero_sign_x);
 438    i.insert_before(assign(zero_sign_x,
 439                           bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
 440
 441    i.insert_before(is_not_zero_or_underflow);
 442    i.insert_before(assign(is_not_zero_or_underflow,
 443                           logic_and(nequal(x, new(ir) ir_constant(0.0f, vec_elem)),
 444                                     gequal(resulting_biased_exp,
 445                                            new(ir) ir_constant(0x1, vec_elem)))));
 446    i.insert_before(assign(x, csel(is_not_zero_or_underflow,
 447                                   x, zero_sign_x)));
 448    i.insert_before(assign(resulting_biased_exp,
 449                           csel(is_not_zero_or_underflow,
 450                                resulting_biased_exp, zeroi)));
 451
 452    /* We could test for overflows by checking if the resulting biased exponent
 453     * would be greater than 0xFE. Turns out we don't need to because the GLSL
 454     * spec says:
 455     *
 456     *    "If this product is too large to be represented in the
 457     *     floating-point type, the result is undefined."
 458     */
 459
 460    ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
 461
 462    /* Don't generate new IR that would need to be lowered in an additional
 463     * pass.
 464     */
 465    if (!lowering(INSERT_TO_SHIFTS)) {
 466       ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
 467       ir->operation = ir_unop_bitcast_i2f;
 468       ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
 469                                         exp_shift_clone, exp_width);
 470       ir->operands[1] = NULL;
 471    } else {
 472       ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x807fffffu, vec_elem);
 473       ir->operation = ir_unop_bitcast_u2f;
 474       ir->operands[0] = bit_or(bit_and(bitcast_f2u(x), sign_mantissa_mask),
 475                                lshift(i2u(resulting_biased_exp), exp_shift_clone));
 476    }
 477
 478    this->progress = true;
 479 }
 480
 481 void
 482 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
 483 {
 484    /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
 485     * from the significand.
 486     */
 487
 488    const unsigned vec_elem = ir->type->vector_elements;
 489
 490    /* Types */
 491    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
 492    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 493
 494    /* Constants */
 495    ir_constant *zeroi = ir_constant::zero(ir, ivec);
 496
 497    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
 498
 499    ir_constant *exp_shift = new(ir) ir_constant(20u);
 500    ir_constant *exp_width = new(ir) ir_constant(11u);
 501    ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
 502
 503    /* Temporary variables */
 504    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
 505    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
 506
 507    ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
 508                                                   ir_var_temporary);
 509
 510    ir_variable *extracted_biased_exp =
 511       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
 512    ir_variable *resulting_biased_exp =
 513       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
 514
 515    ir_variable *is_not_zero_or_underflow =
 516       new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
 517
 518    ir_instruction &i = *base_ir;
 519
 520    /* Copy <x> and <exp> arguments. */
 521    i.insert_before(x);
 522    i.insert_before(assign(x, ir->operands[0]));
 523    i.insert_before(exp);
 524    i.insert_before(assign(exp, ir->operands[1]));
 525
 526    ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
 527    if (lowering(DFREXP_DLDEXP_TO_ARITH))
 528       dfrexp_exp_to_arith(frexp_exp);
 529
 530    /* Extract the biased exponent from <x>. */
 531    i.insert_before(extracted_biased_exp);
 532    i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
 533
 534    i.insert_before(resulting_biased_exp);
 535    i.insert_before(assign(resulting_biased_exp,
 536                           add(extracted_biased_exp, exp)));
 537
 538    /* Test if result is ±0.0, subnormal, or underflow by checking if the
 539     * resulting biased exponent would be less than 0x1. If so, the result is
 540     * 0.0 with the sign of x. (Actually, invert the conditions so that
 541     * immediate values are the second arguments, which is better for i965)
 542     * TODO: Implement in a vector fashion.
 543     */
 544    i.insert_before(zero_sign_x);
 545    for (unsigned elem = 0; elem < vec_elem; elem++) {
 546       ir_variable *unpacked =
 547          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
 548       i.insert_before(unpacked);
 549       i.insert_before(
 550             assign(unpacked,
 551                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
 552       i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
 553                              WRITEMASK_Y));
 554       i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
 555       i.insert_before(assign(zero_sign_x,
 556                              expr(ir_unop_pack_double_2x32, unpacked),
 557                              1 << elem));
 558    }
 559    i.insert_before(is_not_zero_or_underflow);
 560    i.insert_before(assign(is_not_zero_or_underflow,
 561                           gequal(resulting_biased_exp,
 562                                   new(ir) ir_constant(0x1, vec_elem))));
 563    i.insert_before(assign(x, csel(is_not_zero_or_underflow,
 564                                   x, zero_sign_x)));
 565    i.insert_before(assign(resulting_biased_exp,
 566                           csel(is_not_zero_or_underflow,
 567                                resulting_biased_exp, zeroi)));
 568
 569    /* We could test for overflows by checking if the resulting biased exponent
 570     * would be greater than 0xFE. Turns out we don't need to because the GLSL
 571     * spec says:
 572     *
 573     *    "If this product is too large to be represented in the
 574     *     floating-point type, the result is undefined."
 575     */
 576
 577    ir_rvalue *results[4] = {NULL};
 578    for (unsigned elem = 0; elem < vec_elem; elem++) {
 579       ir_variable *unpacked =
 580          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
 581       i.insert_before(unpacked);
 582       i.insert_before(
 583             assign(unpacked,
 584                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
 585
 586       ir_expression *bfi = bitfield_insert(
 587             swizzle_y(unpacked),
 588             i2u(swizzle(resulting_biased_exp, elem, 1)),
 589             exp_shift->clone(ir, NULL),
 590             exp_width->clone(ir, NULL));
 591
 592       i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
 593
 594       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
 595    }
 596
 597    ir->operation = ir_quadop_vector;
 598    ir->operands[0] = results[0];
 599    ir->operands[1] = results[1];
 600    ir->operands[2] = results[2];
 601    ir->operands[3] = results[3];
 602
 603    /* Don't generate new IR that would need to be lowered in an additional
 604     * pass.
 605     */
 606
 607    this->progress = true;
 608 }
 609
 610 void
 611 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
 612 {
 613    const unsigned vec_elem = ir->type->vector_elements;
 614    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 615
 616    /* Double-precision floating-point values are stored as
 617     *   1 sign bit;
 618     *   11 exponent bits;
 619     *   52 mantissa bits.
 620     *
 621     * We're just extracting the significand here, so we only need to modify
 622     * the upper 32-bit uint. Unfortunately we must extract each double
 623     * independently as there is no vector version of unpackDouble.
 624     */
 625
 626    ir_instruction &i = *base_ir;
 627
 628    ir_variable *is_not_zero =
 629       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
 630    ir_rvalue *results[4] = {NULL};
 631
 632    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
 633    i.insert_before(is_not_zero);
 634    i.insert_before(
 635          assign(is_not_zero,
 636                 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
 637
 638    /* TODO: Remake this as more vector-friendly when int64 support is
 639     * available.
 640     */
 641    for (unsigned elem = 0; elem < vec_elem; elem++) {
 642       ir_constant *zero = new(ir) ir_constant(0u, 1);
 643       ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
 644
 645       /* Exponent of double floating-point values in the range [0.5, 1.0). */
 646       ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
 647
 648       ir_variable *bits =
 649          new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
 650       ir_variable *unpacked =
 651          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
 652
 653       ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
 654
 655       i.insert_before(bits);
 656       i.insert_before(unpacked);
 657       i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
 658
 659       /* Manipulate the high uint to remove the exponent and replace it with
 660        * either the default exponent or zero.
 661        */
 662       i.insert_before(assign(bits, swizzle_y(unpacked)));
 663       i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
 664       i.insert_before(assign(bits, bit_or(bits,
 665                                           csel(swizzle(is_not_zero, elem, 1),
 666                                                exponent_value,
 667                                                zero))));
 668       i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
 669       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
 670    }
 671
 672    /* Put the dvec back together */
 673    ir->operation = ir_quadop_vector;
 674    ir->operands[0] = results[0];
 675    ir->operands[1] = results[1];
 676    ir->operands[2] = results[2];
 677    ir->operands[3] = results[3];
 678
 679    this->progress = true;
 680 }
 681
 682 void
 683 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
 684 {
 685    const unsigned vec_elem = ir->type->vector_elements;
 686    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 687    const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
 688
 689    /* Double-precision floating-point values are stored as
 690     *   1 sign bit;
 691     *   11 exponent bits;
 692     *   52 mantissa bits.
 693     *
 694     * We're just extracting the exponent here, so we only care about the upper
 695     * 32-bit uint.
 696     */
 697
 698    ir_instruction &i = *base_ir;
 699
 700    ir_variable *is_not_zero =
 701       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
 702    ir_variable *high_words =
 703       new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
 704    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
 705    ir_constant *izero = new(ir) ir_constant(0, vec_elem);
 706
 707    ir_rvalue *absval = abs(ir->operands[0]);
 708
 709    i.insert_before(is_not_zero);
 710    i.insert_before(high_words);
 711    i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
 712
 713    /* Extract all of the upper uints. */
 714    for (unsigned elem = 0; elem < vec_elem; elem++) {
 715       ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
 716
 717       i.insert_before(assign(high_words,
 718                              swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
 719                              1 << elem));
 720
 721    }
 722    ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
 723    ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
 724
 725    /* For non-zero inputs, shift the exponent down and apply bias. */
 726    ir->operation = ir_triop_csel;
 727    ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
 728    ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
 729    ir->operands[2] = izero;
 730
 731    this->progress = true;
 732 }
 733
 734 void
 735 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
 736 {
 737    /* Translates
 738     *   ir_binop_carry x y
 739     * into
 740     *   sum = ir_binop_add x y
 741     *   bcarry = ir_binop_less sum x
 742     *   carry = ir_unop_b2i bcarry
 743     */
 744
 745    ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
 746    ir->operation = ir_unop_i2u;
 747    ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
 748    ir->operands[1] = NULL;
 749
 750    this->progress = true;
 751 }
 752
 753 void
 754 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
 755 {
 756    /* Translates
 757     *   ir_binop_borrow x y
 758     * into
 759     *   bcarry = ir_binop_less x y
 760     *   carry = ir_unop_b2i bcarry
 761     */
 762
 763    ir->operation = ir_unop_i2u;
 764    ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
 765    ir->operands[1] = NULL;
 766
 767    this->progress = true;
 768 }
 769
 770 void
 771 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
 772 {
 773    /* Translates
 774     *   ir_unop_saturate x
 775     * into
 776     *   ir_binop_min (ir_binop_max(x, 0.0), 1.0)
 777     */
 778
 779    ir->operation = ir_binop_min;
 780    ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
 781                                            ir->operands[0],
 782                                            new(ir) ir_constant(0.0f));
 783    ir->operands[1] = new(ir) ir_constant(1.0f);
 784
 785    this->progress = true;
 786 }
 787
 788 void
 789 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
 790 {
 791    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
 792                                            ir_var_temporary);
 793    this->base_ir->insert_before(temp);
 794
 795    int nc = ir->operands[0]->type->components();
 796    for (int i = nc - 1; i >= 1; i--) {
 797       ir_assignment *assig;
 798       if (i == (nc - 1)) {
 799          assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
 800                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
 801       } else {
 802          assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
 803                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
 804                                   temp));
 805       }
 806       this->base_ir->insert_before(assig);
 807    }
 808
 809    ir->operation = ir_triop_fma;
 810    ir->operands[0] = swizzle(ir->operands[0], 0, 1);
 811    ir->operands[1] = swizzle(ir->operands[1], 0, 1);
 812    ir->operands[2] = new(ir) ir_dereference_variable(temp);
 813
 814    this->progress = true;
 815
 816 }
 817
 818 void
 819 lower_instructions_visitor::double_lrp(ir_expression *ir)
 820 {
 821    int swizval;
 822    ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
 823    ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
 824
 825    switch (op2->type->vector_elements) {
 826    case 1:
 827       swizval = SWIZZLE_XXXX;
 828       break;
 829    default:
 830       assert(op0->type->vector_elements == op2->type->vector_elements);
 831       swizval = SWIZZLE_XYZW;
 832       break;
 833    }
 834
 835    ir->operation = ir_triop_fma;
 836    ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
 837    ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
 838
 839    this->progress = true;
 840 }
 841
 842 void
 843 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
 844 {
 845    /*
 846     * frtemp = frac(x);
 847     * temp = sub(x, frtemp);
 848     * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
 849     */
 850    ir_instruction &i = *base_ir;
 851    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
 852    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
 853    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
 854                                              ir_var_temporary);
 855
 856    i.insert_before(frtemp);
 857    i.insert_before(assign(frtemp, fract(ir->operands[0])));
 858
 859    ir->operation = ir_binop_add;
 860    ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
 861    ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
 862
 863    this->progress = true;
 864 }
 865
 866 void
 867 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
 868 {
 869    /*
 870     * frtemp = frac(x);
 871     * result = sub(x, frtemp);
 872     */
 873    ir->operation = ir_binop_sub;
 874    ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
 875
 876    this->progress = true;
 877 }
 878 void
 879 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
 880 {
 881    /*
 882     * insane but works
 883     * temp = x + 0.5;
 884     * frtemp = frac(temp);
 885     * t2 = sub(temp, frtemp);
 886     * if (frac(x) == 0.5)
 887     *     result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
 888     *  else
 889     *     result = t2;
 890
 891     */
 892    ir_instruction &i = *base_ir;
 893    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
 894                                              ir_var_temporary);
 895    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
 896                                            ir_var_temporary);
 897    ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
 898                                            ir_var_temporary);
 899    ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
 900    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
 901    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
 902
 903    i.insert_before(temp);
 904    i.insert_before(assign(temp, add(ir->operands[0], p5)));
 905
 906    i.insert_before(frtemp);
 907    i.insert_before(assign(frtemp, fract(temp)));
 908
 909    i.insert_before(t2);
 910    i.insert_before(assign(t2, sub(temp, frtemp)));
 911
 912    ir->operation = ir_triop_csel;
 913    ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
 914                            p5->clone(ir, NULL));
 915    ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
 916                                 zero),
 917                           t2,
 918                           sub(t2, one));
 919    ir->operands[2] = new(ir) ir_dereference_variable(t2);
 920
 921    this->progress = true;
 922 }
 923
 924 void
 925 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
 926 {
 927    /*
 928     * frtemp = frac(x);
 929     * temp = sub(x, frtemp);
 930     * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
 931     */
 932    ir_rvalue *arg = ir->operands[0];
 933    ir_instruction &i = *base_ir;
 934
 935    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
 936    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
 937    ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
 938                                              ir_var_temporary);
 939    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
 940                                            ir_var_temporary);
 941
 942    i.insert_before(frtemp);
 943    i.insert_before(assign(frtemp, fract(arg)));
 944    i.insert_before(temp);
 945    i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
 946
 947    ir->operation = ir_triop_csel;
 948    ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
 949    ir->operands[1] = new (ir) ir_dereference_variable(temp);
 950    ir->operands[2] = add(temp,
 951                          csel(equal(frtemp, zero->clone(ir, NULL)),
 952                               zero->clone(ir, NULL),
 953                               one));
 954
 955    this->progress = true;
 956 }
 957
 958 void
 959 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
 960 {
 961    /*
 962     * temp = x > 0.0 ? 1.0 : 0.0;
 963     * result = x < 0.0 ? -1.0 : temp;
 964     */
 965    ir_rvalue *arg = ir->operands[0];
 966    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
 967    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
 968    ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
 969
 970    ir->operation = ir_triop_csel;
 971    ir->operands[0] = less(arg->clone(ir, NULL),
 972                           zero->clone(ir, NULL));
 973    ir->operands[1] = neg_one;
 974    ir->operands[2] = csel(greater(arg, zero),
 975                           one,
 976                           zero->clone(ir, NULL));
 977
 978    this->progress = true;
 979 }
 980
 981 void
 982 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
 983 {
 984    /* For more details, see:
 985     *
 986     * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel
 987     */
 988    const unsigned elements = ir->operands[0]->type->vector_elements;
 989    ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
 990                                            ir_var_temporary);
 991    ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
 992    ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
 993    ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
 994    ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
 995    ir_constant *c1 = new(ir) ir_constant(1u);
 996    ir_constant *c2 = new(ir) ir_constant(2u);
 997    ir_constant *c4 = new(ir) ir_constant(4u);
 998    ir_constant *c24 = new(ir) ir_constant(24u);
 999
1000    base_ir->insert_before(temp);
1001
1002    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1003       base_ir->insert_before(assign(temp, ir->operands[0]));
1004    } else {
1005       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1006       base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
1007    }
1008
1009    /* temp = temp - ((temp >> 1) & 0x55555555u); */
1010    base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
1011                                                          c55555555))));
1012
1013    /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
1014    base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
1015                                            bit_and(rshift(temp, c2),
1016                                                    c33333333->clone(ir, NULL)))));
1017
1018    /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
1019    ir->operation = ir_unop_u2i;
1020    ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
1021                                 c01010101),
1022                             c24);
1023
1024    this->progress = true;
1025 }
1026
1027 void
1028 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
1029 {
1030    ir_variable *bits =
1031       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1032
1033    base_ir->insert_before(bits);
1034    base_ir->insert_before(assign(bits, ir->operands[2]));
1035
1036    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1037       ir_constant *c1 =
1038          new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1039       ir_constant *c32 =
1040          new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1041       ir_constant *cFFFFFFFF =
1042          new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1043
1044       /* At least some hardware treats (x << y) as (x << (y%32)).  This means
1045        * we'd get a mask of 0 when bits is 32.  Special case it.
1046        *
1047        * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
1048        */
1049       ir_expression *mask = csel(equal(bits, c32),
1050                                  cFFFFFFFF,
1051                                  sub(lshift(c1, bits), c1->clone(ir, NULL)));
1052
1053       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1054        *
1055        *    If bits is zero, the result will be zero.
1056        *
1057        * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
1058        * select as in the signed integer case.
1059        *
1060        * (value >> offset) & mask;
1061        */
1062       ir->operation = ir_binop_bit_and;
1063       ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
1064       ir->operands[1] = mask;
1065       ir->operands[2] = NULL;
1066    } else {
1067       ir_constant *c0 =
1068          new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
1069       ir_constant *c32 =
1070          new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1071       ir_variable *temp =
1072          new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
1073
1074       /* temp = 32 - bits; */
1075       base_ir->insert_before(temp);
1076       base_ir->insert_before(assign(temp, sub(c32, bits)));
1077
1078       /* expr = value << (temp - offset)) >> temp; */
1079       ir_expression *expr =
1080          rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
1081
1082       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1083        *
1084        *    If bits is zero, the result will be zero.
1085        *
1086        * Due to the (x << (y%32)) behavior mentioned before, the (value <<
1087        * (32-0)) doesn't "erase" all of the data as we would like, so finish
1088        * up with:
1089        *
1090        * (bits == 0) ? 0 : e;
1091        */
1092       ir->operation = ir_triop_csel;
1093       ir->operands[0] = equal(c0, bits);
1094       ir->operands[1] = c0->clone(ir, NULL);
1095       ir->operands[2] = expr;
1096    }
1097
1098    this->progress = true;
1099 }
1100
1101 void
1102 lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
1103 {
1104    ir_constant *c1;
1105    ir_constant *c32;
1106    ir_constant *cFFFFFFFF;
1107    ir_variable *offset =
1108       new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
1109    ir_variable *bits =
1110       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1111    ir_variable *mask =
1112       new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
1113
1114    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1115       c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
1116       c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1117       cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
1118    } else {
1119       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1120
1121       c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1122       c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1123       cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1124    }
1125
1126    base_ir->insert_before(offset);
1127    base_ir->insert_before(assign(offset, ir->operands[2]));
1128
1129    base_ir->insert_before(bits);
1130    base_ir->insert_before(assign(bits, ir->operands[3]));
1131
1132    /* At least some hardware treats (x << y) as (x << (y%32)).  This means
1133     * we'd get a mask of 0 when bits is 32.  Special case it.
1134     *
1135     * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
1136     *
1137     * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1138     *
1139     *    The result will be undefined if offset or bits is negative, or if the
1140     *    sum of offset and bits is greater than the number of bits used to
1141     *    store the operand.
1142     *
1143     * Since it's undefined, there are a couple other ways this could be
1144     * implemented.  The other way that was considered was to put the csel
1145     * around the whole thing:
1146     *
1147     *    final_result = bits == 32 ? insert : ... ;
1148     */
1149    base_ir->insert_before(mask);
1150
1151    base_ir->insert_before(assign(mask, csel(equal(bits, c32),
1152                                             cFFFFFFFF,
1153                                             lshift(sub(lshift(c1, bits),
1154                                                        c1->clone(ir, NULL)),
1155                                                    offset))));
1156
1157    /* (base & ~mask) | ((insert << offset) & mask) */
1158    ir->operation = ir_binop_bit_or;
1159    ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
1160    ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
1161    ir->operands[2] = NULL;
1162    ir->operands[3] = NULL;
1163
1164    this->progress = true;
1165 }
1166
1167 void
1168 lower_instructions_visitor::reverse_to_shifts(ir_expression *ir)
1169 {
1170    /* For more details, see:
1171     *
1172     * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
1173     */
1174    ir_constant *c1 =
1175       new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1176    ir_constant *c2 =
1177       new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements);
1178    ir_constant *c4 =
1179       new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements);
1180    ir_constant *c8 =
1181       new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements);
1182    ir_constant *c16 =
1183       new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements);
1184    ir_constant *c33333333 =
1185       new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements);
1186    ir_constant *c55555555 =
1187       new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements);
1188    ir_constant *c0F0F0F0F =
1189       new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements);
1190    ir_constant *c00FF00FF =
1191       new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements);
1192    ir_variable *temp =
1193       new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements),
1194                           "temp", ir_var_temporary);
1195    ir_instruction &i = *base_ir;
1196
1197    i.insert_before(temp);
1198
1199    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1200       i.insert_before(assign(temp, ir->operands[0]));
1201    } else {
1202       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1203       i.insert_before(assign(temp, i2u(ir->operands[0])));
1204    }
1205
1206    /* Swap odd and even bits.
1207     *
1208     * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1);
1209     */
1210    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555),
1211                                        lshift(bit_and(temp, c55555555->clone(ir, NULL)),
1212                                               c1->clone(ir, NULL)))));
1213    /* Swap consecutive pairs.
1214     *
1215     * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2);
1216     */
1217    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333),
1218                                        lshift(bit_and(temp, c33333333->clone(ir, NULL)),
1219                                               c2->clone(ir, NULL)))));
1220
1221    /* Swap nibbles.
1222     *
1223     * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4);
1224     */
1225    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F),
1226                                        lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)),
1227                                               c4->clone(ir, NULL)))));
1228
1229    /* The last step is, basically, bswap.  Swap the bytes, then swap the
1230     * words.  When this code is run through GCC on x86, it does generate a
1231     * bswap instruction.
1232     *
1233     * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8);
1234     * temp = ( temp >> 16              ) | ( temp                << 16);
1235     */
1236    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF),
1237                                        lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)),
1238                                               c8->clone(ir, NULL)))));
1239
1240    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1241       ir->operation = ir_binop_bit_or;
1242       ir->operands[0] = rshift(temp, c16);
1243       ir->operands[1] = lshift(temp, c16->clone(ir, NULL));
1244    } else {
1245       ir->operation = ir_unop_u2i;
1246       ir->operands[0] = bit_or(rshift(temp, c16),
1247                                lshift(temp, c16->clone(ir, NULL)));
1248    }
1249
1250    this->progress = true;
1251 }
1252
1253 void
1254 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
1255 {
1256    /* For more details, see:
1257     *
1258     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1259     */
1260    const unsigned elements = ir->operands[0]->type->vector_elements;
1261    ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
1262    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1263    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1264    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1265    ir_variable *temp =
1266       new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary);
1267    ir_variable *lsb_only =
1268       new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary);
1269    ir_variable *as_float =
1270       new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1271    ir_variable *lsb =
1272       new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary);
1273
1274    ir_instruction &i = *base_ir;
1275
1276    i.insert_before(temp);
1277
1278    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1279       i.insert_before(assign(temp, ir->operands[0]));
1280    } else {
1281       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1282       i.insert_before(assign(temp, u2i(ir->operands[0])));
1283    }
1284
1285    /* The int-to-float conversion is lossless because (value & -value) is
1286     * either a power of two or zero.  We don't use the result in the zero
1287     * case.  The uint() cast is necessary so that 0x80000000 does not
1288     * generate a negative value.
1289     *
1290     * uint lsb_only = uint(value & -value);
1291     * float as_float = float(lsb_only);
1292     */
1293    i.insert_before(lsb_only);
1294    i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
1295
1296    i.insert_before(as_float);
1297    i.insert_before(assign(as_float, u2f(lsb_only)));
1298
1299    /* This is basically an open-coded frexp.  Implementations that have a
1300     * native frexp instruction would be better served by that.  This is
1301     * optimized versus a full-featured open-coded implementation in two ways:
1302     *
1303     * - We don't care about a correct result from subnormal numbers (including
1304     *   0.0), so the raw exponent can always be safely unbiased.
1305     *
1306     * - The value cannot be negative, so it does not need to be masked off to
1307     *   extract the exponent.
1308     *
1309     * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1310     */
1311    i.insert_before(lsb);
1312    i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1313
1314    /* Use lsb_only in the comparison instead of temp so that the & (far above)
1315     * can possibly generate the result without an explicit comparison.
1316     *
1317     * (lsb_only == 0) ? -1 : lsb;
1318     *
1319     * Since our input values are all integers, the unbiased exponent must not
1320     * be negative.  It will only be negative (-0x7f, in fact) if lsb_only is
1321     * 0.  Instead of using (lsb_only == 0), we could use (lsb >= 0).  Which is
1322     * better is likely GPU dependent.  Either way, the difference should be
1323     * small.
1324     */
1325    ir->operation = ir_triop_csel;
1326    ir->operands[0] = equal(lsb_only, c0);
1327    ir->operands[1] = cminus1;
1328    ir->operands[2] = new(ir) ir_dereference_variable(lsb);
1329
1330    this->progress = true;
1331 }
1332
1333 void
1334 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
1335 {
1336    /* For more details, see:
1337     *
1338     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1339     */
1340    const unsigned elements = ir->operands[0]->type->vector_elements;
1341    ir_constant *c0 = new(ir) ir_constant(int(0), elements);
1342    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1343    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1344    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1345    ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
1346    ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
1347    ir_variable *temp =
1348       new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary);
1349    ir_variable *as_float =
1350       new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1351    ir_variable *msb =
1352       new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary);
1353
1354    ir_instruction &i = *base_ir;
1355
1356    i.insert_before(temp);
1357
1358    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1359       i.insert_before(assign(temp, ir->operands[0]));
1360    } else {
1361       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1362
1363       /* findMSB(uint(abs(some_int))) almost always does the right thing.
1364        * There are two problem values:
1365        *
1366        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, findMSB returns
1367        *   31.  However, findMSB(int(0x80000000)) == 30.
1368        *
1369        * * 0xffffffff.  Since abs(0xffffffff) == 1, findMSB returns
1370        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1371        *
1372        *    For a value of zero or negative one, -1 will be returned.
1373        *
1374        * For all negative number cases, including 0x80000000 and 0xffffffff,
1375        * the correct value is obtained from findMSB if instead of negating the
1376        * (already negative) value the logical-not is used.  A conditonal
1377        * logical-not can be achieved in two instructions.
1378        */
1379       ir_variable *as_int =
1380          new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary);
1381       ir_constant *c31 = new(ir) ir_constant(int(31), elements);
1382
1383       i.insert_before(as_int);
1384       i.insert_before(assign(as_int, ir->operands[0]));
1385       i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
1386                                             as_int,
1387                                             rshift(as_int, c31)))));
1388    }
1389
1390    /* The int-to-float conversion is lossless because bits are conditionally
1391     * masked off the bottom of temp to ensure the value has at most 24 bits of
1392     * data or is zero.  We don't use the result in the zero case.  The uint()
1393     * cast is necessary so that 0x80000000 does not generate a negative value.
1394     *
1395     * float as_float = float(temp > 255 ? temp & ~255 : temp);
1396     */
1397    i.insert_before(as_float);
1398    i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
1399                                              bit_and(temp, cFFFFFF00),
1400                                              temp))));
1401
1402    /* This is basically an open-coded frexp.  Implementations that have a
1403     * native frexp instruction would be better served by that.  This is
1404     * optimized versus a full-featured open-coded implementation in two ways:
1405     *
1406     * - We don't care about a correct result from subnormal numbers (including
1407     *   0.0), so the raw exponent can always be safely unbiased.
1408     *
1409     * - The value cannot be negative, so it does not need to be masked off to
1410     *   extract the exponent.
1411     *
1412     * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1413     */
1414    i.insert_before(msb);
1415    i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1416
1417    /* Use msb in the comparison instead of temp so that the subtract can
1418     * possibly generate the result without an explicit comparison.
1419     *
1420     * (msb < 0) ? -1 : msb;
1421     *
1422     * Since our input values are all integers, the unbiased exponent must not
1423     * be negative.  It will only be negative (-0x7f, in fact) if temp is 0.
1424     */
1425    ir->operation = ir_triop_csel;
1426    ir->operands[0] = less(msb, c0);
1427    ir->operands[1] = cminus1;
1428    ir->operands[2] = new(ir) ir_dereference_variable(msb);
1429
1430    this->progress = true;
1431 }
1432
1433 ir_expression *
1434 lower_instructions_visitor::_carry(operand a, operand b)
1435 {
1436    if (lowering(CARRY_TO_ARITH))
1437       return i2u(b2i(less(add(a, b),
1438                           a.val->clone(ralloc_parent(a.val), NULL))));
1439    else
1440       return carry(a, b);
1441 }
1442
1443 void
1444 lower_instructions_visitor::imul_high_to_mul(ir_expression *ir)
1445 {
1446    /*   ABCD
1447     * * EFGH
1448     * ======
1449     * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32
1450     *
1451     * In GLSL, (a * b) becomes
1452     *
1453     * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu);
1454     * uint m2 = (a & 0x0000ffffu) * (b >> 16);
1455     * uint m3 = (a >> 16)         * (b & 0x0000ffffu);
1456     * uint m4 = (a >> 16)         * (b >> 16);
1457     *
1458     * uint c1;
1459     * uint c2;
1460     * uint lo_result;
1461     * uint hi_result;
1462     *
1463     * lo_result = uaddCarry(m1, m2 << 16, c1);
1464     * hi_result = m4 + c1;
1465     * lo_result = uaddCarry(lo_result, m3 << 16, c2);
1466     * hi_result = hi_result + c2;
1467     * hi_result = hi_result + (m2 >> 16) + (m3 >> 16);
1468     */
1469    const unsigned elements = ir->operands[0]->type->vector_elements;
1470    ir_variable *src1 =
1471       new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary);
1472    ir_variable *src1h =
1473       new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary);
1474    ir_variable *src1l =
1475       new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary);
1476    ir_variable *src2 =
1477       new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary);
1478    ir_variable *src2h =
1479       new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary);
1480    ir_variable *src2l =
1481       new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary);
1482    ir_variable *t1 =
1483       new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary);
1484    ir_variable *t2 =
1485       new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary);
1486    ir_variable *lo =
1487       new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary);
1488    ir_variable *hi =
1489       new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary);
1490    ir_variable *different_signs = NULL;
1491    ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements);
1492    ir_constant *c16 = new(ir) ir_constant(16u, elements);
1493
1494    ir_instruction &i = *base_ir;
1495
1496    i.insert_before(src1);
1497    i.insert_before(src2);
1498    i.insert_before(src1h);
1499    i.insert_before(src2h);
1500    i.insert_before(src1l);
1501    i.insert_before(src2l);
1502
1503    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1504       i.insert_before(assign(src1, ir->operands[0]));
1505       i.insert_before(assign(src2, ir->operands[1]));
1506    } else {
1507       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1508
1509       ir_variable *itmp1 =
1510          new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary);
1511       ir_variable *itmp2 =
1512          new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary);
1513       ir_constant *c0 = new(ir) ir_constant(int(0), elements);
1514
1515       i.insert_before(itmp1);
1516       i.insert_before(itmp2);
1517       i.insert_before(assign(itmp1, ir->operands[0]));
1518       i.insert_before(assign(itmp2, ir->operands[1]));
1519
1520       different_signs =
1521          new(ir) ir_variable(glsl_type::bvec(elements), "different_signs",
1522                              ir_var_temporary);
1523
1524       i.insert_before(different_signs);
1525       i.insert_before(assign(different_signs, expr(ir_binop_logic_xor,
1526                                                    less(itmp1, c0),
1527                                                    less(itmp2, c0->clone(ir, NULL)))));
1528
1529       i.insert_before(assign(src1, i2u(abs(itmp1))));
1530       i.insert_before(assign(src2, i2u(abs(itmp2))));
1531    }
1532
1533    i.insert_before(assign(src1l, bit_and(src1, c0000FFFF)));
1534    i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL))));
1535    i.insert_before(assign(src1h, rshift(src1, c16)));
1536    i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL))));
1537
1538    i.insert_before(lo);
1539    i.insert_before(hi);
1540    i.insert_before(t1);
1541    i.insert_before(t2);
1542
1543    i.insert_before(assign(lo, mul(src1l, src2l)));
1544    i.insert_before(assign(t1, mul(src1l, src2h)));
1545    i.insert_before(assign(t2, mul(src1h, src2l)));
1546    i.insert_before(assign(hi, mul(src1h, src2h)));
1547
1548    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL))))));
1549    i.insert_before(assign(lo,            add(lo, lshift(t1, c16->clone(ir, NULL)))));
1550
1551    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL))))));
1552    i.insert_before(assign(lo,            add(lo, lshift(t2, c16->clone(ir, NULL)))));
1553
1554    if (different_signs == NULL) {
1555       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1556
1557       ir->operation = ir_binop_add;
1558       ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL)));
1559       ir->operands[1] = rshift(t2, c16->clone(ir, NULL));
1560    } else {
1561       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1562
1563       i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))),
1564                                      rshift(t2, c16->clone(ir, NULL)))));
1565
1566       /* For channels where different_signs is set we have to perform a 64-bit
1567        * negation.  This is *not* the same as just negating the high 32-bits.
1568        * Consider -3 * 2.  The high 32-bits is 0, but the desired result is
1569        * -1, not -0!  Recall -x == ~x + 1.
1570        */
1571       ir_variable *neg_hi =
1572          new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary);
1573       ir_constant *c1 = new(ir) ir_constant(1u, elements);
1574
1575       i.insert_before(neg_hi);
1576       i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)),
1577                                          u2i(_carry(bit_not(lo), c1)))));
1578
1579       ir->operation = ir_triop_csel;
1580       ir->operands[0] = new(ir) ir_dereference_variable(different_signs);
1581       ir->operands[1] = new(ir) ir_dereference_variable(neg_hi);
1582       ir->operands[2] = u2i(hi);
1583    }
1584 }
1585
1586 void
1587 lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir)
1588 {
1589    ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]);
1590    this->progress = true;
1591 }
1592
1593 ir_visitor_status
1594 lower_instructions_visitor::visit_leave(ir_expression *ir)
1595 {
1596    switch (ir->operation) {
1597    case ir_binop_dot:
1598       if (ir->operands[0]->type->is_double())
1599          double_dot_to_fma(ir);
1600       break;
1601    case ir_triop_lrp:
1602       if (ir->operands[0]->type->is_double())
1603          double_lrp(ir);
1604       break;
1605    case ir_binop_sub:
1606       if (lowering(SUB_TO_ADD_NEG))
1607          sub_to_add_neg(ir);
1608       break;
1609
1610    case ir_binop_div:
1611       if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
1612          int_div_to_mul_rcp(ir);
1613       else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) ||
1614                (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP)))
1615          div_to_mul_rcp(ir);
1616       break;
1617
1618    case ir_unop_exp:
1619       if (lowering(EXP_TO_EXP2))
1620          exp_to_exp2(ir);
1621       break;
1622
1623    case ir_unop_log:
1624       if (lowering(LOG_TO_LOG2))
1625          log_to_log2(ir);
1626       break;
1627
1628    case ir_binop_mod:
1629       if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
1630          mod_to_floor(ir);
1631       break;
1632
1633    case ir_binop_pow:
1634       if (lowering(POW_TO_EXP2))
1635          pow_to_exp2(ir);
1636       break;
1637
1638    case ir_binop_ldexp:
1639       if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1640          ldexp_to_arith(ir);
1641       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1642          dldexp_to_arith(ir);
1643       break;
1644
1645    case ir_unop_frexp_exp:
1646       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1647          dfrexp_exp_to_arith(ir);
1648       break;
1649
1650    case ir_unop_frexp_sig:
1651       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1652          dfrexp_sig_to_arith(ir);
1653       break;
1654
1655    case ir_binop_carry:
1656       if (lowering(CARRY_TO_ARITH))
1657          carry_to_arith(ir);
1658       break;
1659
1660    case ir_binop_borrow:
1661       if (lowering(BORROW_TO_ARITH))
1662          borrow_to_arith(ir);
1663       break;
1664
1665    case ir_unop_saturate:
1666       if (lowering(SAT_TO_CLAMP))
1667          sat_to_clamp(ir);
1668       break;
1669
1670    case ir_unop_trunc:
1671       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1672          dtrunc_to_dfrac(ir);
1673       break;
1674
1675    case ir_unop_ceil:
1676       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1677          dceil_to_dfrac(ir);
1678       break;
1679
1680    case ir_unop_floor:
1681       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1682          dfloor_to_dfrac(ir);
1683       break;
1684
1685    case ir_unop_round_even:
1686       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1687          dround_even_to_dfrac(ir);
1688       break;
1689
1690    case ir_unop_sign:
1691       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1692          dsign_to_csel(ir);
1693       break;
1694
1695    case ir_unop_bit_count:
1696       if (lowering(BIT_COUNT_TO_MATH))
1697          bit_count_to_math(ir);
1698       break;
1699
1700    case ir_triop_bitfield_extract:
1701       if (lowering(EXTRACT_TO_SHIFTS))
1702          extract_to_shifts(ir);
1703       break;
1704
1705    case ir_quadop_bitfield_insert:
1706       if (lowering(INSERT_TO_SHIFTS))
1707          insert_to_shifts(ir);
1708       break;
1709
1710    case ir_unop_bitfield_reverse:
1711       if (lowering(REVERSE_TO_SHIFTS))
1712          reverse_to_shifts(ir);
1713       break;
1714
1715    case ir_unop_find_lsb:
1716       if (lowering(FIND_LSB_TO_FLOAT_CAST))
1717          find_lsb_to_float_cast(ir);
1718       break;
1719
1720    case ir_unop_find_msb:
1721       if (lowering(FIND_MSB_TO_FLOAT_CAST))
1722          find_msb_to_float_cast(ir);
1723       break;
1724
1725    case ir_binop_imul_high:
1726       if (lowering(IMUL_HIGH_TO_MUL))
1727          imul_high_to_mul(ir);
1728       break;
1729
1730    case ir_unop_rsq:
1731    case ir_unop_sqrt:
1732       if (lowering(SQRT_TO_ABS_SQRT))
1733          sqrt_to_abs_sqrt(ir);
1734       break;
1735
1736    default:
1737       return visit_continue;
1738    }
1739
1740    return visit_continue;
1741 }