src/compiler/glsl/lower_instructions.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 /**
  25  * \file lower_instructions.cpp
  26  *
  27  * Many GPUs lack native instructions for certain expression operations, and
  28  * must replace them with some other expression tree.  This pass lowers some
  29  * of the most common cases, allowing the lowering code to be implemented once
  30  * rather than in each driver backend.
  31  *
  32  * Currently supported transformations:
  33  * - SUB_TO_ADD_NEG
  34  * - DIV_TO_MUL_RCP
  35  * - INT_DIV_TO_MUL_RCP
  36  * - EXP_TO_EXP2
  37  * - POW_TO_EXP2
  38  * - LOG_TO_LOG2
  39  * - MOD_TO_FLOOR
  40  * - LDEXP_TO_ARITH
 * - DFREXP_DLDEXP_TO_ARITH
  42  * - CARRY_TO_ARITH
  43  * - BORROW_TO_ARITH
  44  * - SAT_TO_CLAMP
  45  * - DOPS_TO_DFRAC
  46  *
  47  * SUB_TO_ADD_NEG:
  48  * ---------------
  49  * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
  50  *
  51  * This simplifies expression reassociation, and for many backends
  52  * there is no subtract operation separate from adding the negation.
  53  * For backends with native subtract operations, they will probably
  54  * want to recognize add(op0, neg(op1)) or the other way around to
  55  * produce a subtract anyway.
  56  *
  57  * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
  58  * --------------------------------------
  59  * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
  60  *
  61  * Many GPUs don't have a divide instruction (945 and 965 included),
  62  * but they do have an RCP instruction to compute an approximate
  63  * reciprocal.  By breaking the operation down, constant reciprocals
  64  * can get constant folded.
  65  *
  66  * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
  67  * handles the integer case, converting to and from floating point so that
  68  * RCP is possible.
  69  *
  70  * EXP_TO_EXP2 and LOG_TO_LOG2:
  71  * ----------------------------
  72  * Many GPUs don't have a base e log or exponent instruction, but they
  73  * do have base 2 versions, so this pass converts exp and log to exp2
  74  * and log2 operations.
  75  *
  76  * POW_TO_EXP2:
  77  * -----------
  78  * Many older GPUs don't have an x**y instruction.  For these GPUs, convert
  79  * x**y to 2**(y * log2(x)).
  80  *
  81  * MOD_TO_FLOOR:
  82  * -------------
  83  * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
  84  *
  85  * Many GPUs don't have a MOD instruction (945 and 965 included), and
  86  * if we have to break it down like this anyway, it gives an
  87  * opportunity to do things like constant fold the (1.0 / op1) easily.
  88  *
 * Note: this was previously implemented as op1 * fract(op0 / op1), but that
 * implementation had significant precision errors.
  91  *
  92  * LDEXP_TO_ARITH:
  93  * -------------
  94  * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
  95  *
  96  * DFREXP_DLDEXP_TO_ARITH:
  97  * ---------------
  98  * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
  99  * arithmetic and bit ops for double arguments.
 100  *
 101  * CARRY_TO_ARITH:
 102  * ---------------
 103  * Converts ir_carry into (x + y) < x.
 104  *
 105  * BORROW_TO_ARITH:
 106  * ----------------
 107  * Converts ir_borrow into (x < y).
 108  *
 109  * SAT_TO_CLAMP:
 110  * -------------
 111  * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
 112  *
 113  * DOPS_TO_DFRAC:
 114  * --------------
 115  * Converts double trunc, ceil, floor, round to fract
 116  */
 117
 118 #include "c99_math.h"
 119 #include "program/prog_instruction.h" /* for swizzle */
 120 #include "compiler/glsl_types.h"
 121 #include "ir.h"
 122 #include "ir_builder.h"
 123 #include "ir_optimization.h"
 124
 125 using namespace ir_builder;
 126
 127 namespace {
 128
 129 class lower_instructions_visitor : public ir_hierarchical_visitor {
 130 public:
 131    lower_instructions_visitor(unsigned lower)
 132       : progress(false), lower(lower) { }
 133
 134    ir_visitor_status visit_leave(ir_expression *);
 135
 136    bool progress;
 137
 138 private:
 139    unsigned lower; /** Bitfield of which operations to lower */
 140
 141    void sub_to_add_neg(ir_expression *);
 142    void div_to_mul_rcp(ir_expression *);
 143    void int_div_to_mul_rcp(ir_expression *);
 144    void mod_to_floor(ir_expression *);
 145    void exp_to_exp2(ir_expression *);
 146    void pow_to_exp2(ir_expression *);
 147    void log_to_log2(ir_expression *);
 148    void ldexp_to_arith(ir_expression *);
 149    void dldexp_to_arith(ir_expression *);
 150    void dfrexp_sig_to_arith(ir_expression *);
 151    void dfrexp_exp_to_arith(ir_expression *);
 152    void carry_to_arith(ir_expression *);
 153    void borrow_to_arith(ir_expression *);
 154    void sat_to_clamp(ir_expression *);
 155    void double_dot_to_fma(ir_expression *);
 156    void double_lrp(ir_expression *);
 157    void dceil_to_dfrac(ir_expression *);
 158    void dfloor_to_dfrac(ir_expression *);
 159    void dround_even_to_dfrac(ir_expression *);
 160    void dtrunc_to_dfrac(ir_expression *);
 161    void dsign_to_csel(ir_expression *);
 162    void bit_count_to_math(ir_expression *);
 163    void extract_to_shifts(ir_expression *);
 164    void insert_to_shifts(ir_expression *);
 165 };
 166
 167 } /* anonymous namespace */
 168
 169 /**
 170  * Determine if a particular type of lowering should occur
 171  */
 172 #define lowering(x) (this->lower & x)
 173
 174 bool
 175 lower_instructions(exec_list *instructions, unsigned what_to_lower)
 176 {
 177    lower_instructions_visitor v(what_to_lower);
 178
 179    visit_list_elements(&v, instructions);
 180    return v.progress;
 181 }
 182
 183 void
 184 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
 185 {
 186    ir->operation = ir_binop_add;
 187    ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
 188                                            ir->operands[1], NULL);
 189    this->progress = true;
 190 }
 191
 192 void
 193 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
 194 {
 195    assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
 196
 197    /* New expression for the 1.0 / op1 */
 198    ir_rvalue *expr;
 199    expr = new(ir) ir_expression(ir_unop_rcp,
 200                                 ir->operands[1]->type,
 201                                 ir->operands[1]);
 202
 203    /* op0 / op1 -> op0 * (1.0 / op1) */
 204    ir->operation = ir_binop_mul;
 205    ir->operands[1] = expr;
 206
 207    this->progress = true;
 208 }
 209
 210 void
 211 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
 212 {
 213    assert(ir->operands[1]->type->is_integer());
 214
 215    /* Be careful with integer division -- we need to do it as a
 216     * float and re-truncate, since rcp(n > 1) of an integer would
 217     * just be 0.
 218     */
 219    ir_rvalue *op0, *op1;
 220    const struct glsl_type *vec_type;
 221
 222    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 223                                       ir->operands[1]->type->vector_elements,
 224                                       ir->operands[1]->type->matrix_columns);
 225
 226    if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
 227       op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
 228    else
 229       op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
 230
 231    op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
 232
 233    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 234                                       ir->operands[0]->type->vector_elements,
 235                                       ir->operands[0]->type->matrix_columns);
 236
 237    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
 238       op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
 239    else
 240       op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
 241
 242    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 243                                       ir->type->vector_elements,
 244                                       ir->type->matrix_columns);
 245
 246    op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
 247
 248    if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
 249       ir->operation = ir_unop_f2i;
 250       ir->operands[0] = op0;
 251    } else {
 252       ir->operation = ir_unop_i2u;
 253       ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
 254    }
 255    ir->operands[1] = NULL;
 256
 257    this->progress = true;
 258 }
 259
 260 void
 261 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
 262 {
 263    ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
 264
 265    ir->operation = ir_unop_exp2;
 266    ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
 267                                            ir->operands[0], log2_e);
 268    this->progress = true;
 269 }
 270
 271 void
 272 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
 273 {
 274    ir_expression *const log2_x =
 275       new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
 276                             ir->operands[0]);
 277
 278    ir->operation = ir_unop_exp2;
 279    ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
 280                                            ir->operands[1], log2_x);
 281    ir->operands[1] = NULL;
 282    this->progress = true;
 283 }
 284
 285 void
 286 lower_instructions_visitor::log_to_log2(ir_expression *ir)
 287 {
 288    ir->operation = ir_binop_mul;
 289    ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
 290                                            ir->operands[0], NULL);
 291    ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
 292    this->progress = true;
 293 }
 294
 295 void
 296 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
 297 {
 298    ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
 299                                          ir_var_temporary);
 300    ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
 301                                          ir_var_temporary);
 302    this->base_ir->insert_before(x);
 303    this->base_ir->insert_before(y);
 304
 305    ir_assignment *const assign_x =
 306       new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
 307                             ir->operands[0], NULL);
 308    ir_assignment *const assign_y =
 309       new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
 310                             ir->operands[1], NULL);
 311
 312    this->base_ir->insert_before(assign_x);
 313    this->base_ir->insert_before(assign_y);
 314
 315    ir_expression *const div_expr =
 316       new(ir) ir_expression(ir_binop_div, x->type,
 317                             new(ir) ir_dereference_variable(x),
 318                             new(ir) ir_dereference_variable(y));
 319
 320    /* Don't generate new IR that would need to be lowered in an additional
 321     * pass.
 322     */
 323    if (lowering(DIV_TO_MUL_RCP) && (ir->type->is_float() || ir->type->is_double()))
 324       div_to_mul_rcp(div_expr);
 325
 326    ir_expression *const floor_expr =
 327       new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
 328
 329    if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
 330       dfloor_to_dfrac(floor_expr);
 331
 332    ir_expression *const mul_expr =
 333       new(ir) ir_expression(ir_binop_mul,
 334                             new(ir) ir_dereference_variable(y),
 335                             floor_expr);
 336
 337    ir->operation = ir_binop_sub;
 338    ir->operands[0] = new(ir) ir_dereference_variable(x);
 339    ir->operands[1] = mul_expr;
 340    this->progress = true;
 341 }
 342
 343 void
 344 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
 345 {
 346    /* Translates
 347     *    ir_binop_ldexp x exp
 348     * into
 349     *
 350     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
 351     *    resulting_biased_exp = extracted_biased_exp + exp;
 352     *
 353     *    if (resulting_biased_exp < 1 || x == 0.0f) {
 354     *       return copysign(0.0, x);
 355     *    }
 356     *
 357     *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
 358     *                       lshift(i2u(resulting_biased_exp), exp_shift));
 359     *
 360     * which we can't actually implement as such, since the GLSL IR doesn't
 361     * have vectorized if-statements. We actually implement it without branches
 362     * using conditional-select:
 363     *
 364     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
 365     *    resulting_biased_exp = extracted_biased_exp + exp;
 366     *
 367     *    is_not_zero_or_underflow = logic_and(nequal(x, 0.0f),
 368     *                                         gequal(resulting_biased_exp, 1);
 369     *    x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
 370     *    resulting_biased_exp = csel(is_not_zero_or_underflow,
 371     *                                resulting_biased_exp, 0);
 372     *
 373     *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
 374     *                       lshift(i2u(resulting_biased_exp), exp_shift));
 375     */
 376
 377    const unsigned vec_elem = ir->type->vector_elements;
 378
 379    /* Types */
 380    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
 381    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 382
 383    /* Constants */
 384    ir_constant *zeroi = ir_constant::zero(ir, ivec);
 385
 386    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
 387
 388    ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
 389    ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
 390
 391    /* Temporary variables */
 392    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
 393    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
 394
 395    ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
 396                                                   ir_var_temporary);
 397
 398    ir_variable *extracted_biased_exp =
 399       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
 400    ir_variable *resulting_biased_exp =
 401       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
 402
 403    ir_variable *is_not_zero_or_underflow =
 404       new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
 405
 406    ir_instruction &i = *base_ir;
 407
 408    /* Copy <x> and <exp> arguments. */
 409    i.insert_before(x);
 410    i.insert_before(assign(x, ir->operands[0]));
 411    i.insert_before(exp);
 412    i.insert_before(assign(exp, ir->operands[1]));
 413
 414    /* Extract the biased exponent from <x>. */
 415    i.insert_before(extracted_biased_exp);
 416    i.insert_before(assign(extracted_biased_exp,
 417                           rshift(bitcast_f2i(abs(x)), exp_shift)));
 418
 419    i.insert_before(resulting_biased_exp);
 420    i.insert_before(assign(resulting_biased_exp,
 421                           add(extracted_biased_exp, exp)));
 422
 423    /* Test if result is ±0.0, subnormal, or underflow by checking if the
 424     * resulting biased exponent would be less than 0x1. If so, the result is
 425     * 0.0 with the sign of x. (Actually, invert the conditions so that
 426     * immediate values are the second arguments, which is better for i965)
 427     */
 428    i.insert_before(zero_sign_x);
 429    i.insert_before(assign(zero_sign_x,
 430                           bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
 431
 432    i.insert_before(is_not_zero_or_underflow);
 433    i.insert_before(assign(is_not_zero_or_underflow,
 434                           logic_and(nequal(x, new(ir) ir_constant(0.0f, vec_elem)),
 435                                     gequal(resulting_biased_exp,
 436                                            new(ir) ir_constant(0x1, vec_elem)))));
 437    i.insert_before(assign(x, csel(is_not_zero_or_underflow,
 438                                   x, zero_sign_x)));
 439    i.insert_before(assign(resulting_biased_exp,
 440                           csel(is_not_zero_or_underflow,
 441                                resulting_biased_exp, zeroi)));
 442
 443    /* We could test for overflows by checking if the resulting biased exponent
 444     * would be greater than 0xFE. Turns out we don't need to because the GLSL
 445     * spec says:
 446     *
 447     *    "If this product is too large to be represented in the
 448     *     floating-point type, the result is undefined."
 449     */
 450
 451    ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
 452    ir->operation = ir_unop_bitcast_i2f;
 453    ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
 454                                      exp_shift_clone, exp_width);
 455    ir->operands[1] = NULL;
 456
 457    this->progress = true;
 458 }
 459
 460 void
 461 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
 462 {
 463    /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
 464     * from the significand.
 465     */
 466
 467    const unsigned vec_elem = ir->type->vector_elements;
 468
 469    /* Types */
 470    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
 471    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 472
 473    /* Constants */
 474    ir_constant *zeroi = ir_constant::zero(ir, ivec);
 475
 476    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
 477
 478    ir_constant *exp_shift = new(ir) ir_constant(20u);
 479    ir_constant *exp_width = new(ir) ir_constant(11u);
 480    ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
 481
 482    /* Temporary variables */
 483    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
 484    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
 485
 486    ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
 487                                                   ir_var_temporary);
 488
 489    ir_variable *extracted_biased_exp =
 490       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
 491    ir_variable *resulting_biased_exp =
 492       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
 493
 494    ir_variable *is_not_zero_or_underflow =
 495       new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
 496
 497    ir_instruction &i = *base_ir;
 498
 499    /* Copy <x> and <exp> arguments. */
 500    i.insert_before(x);
 501    i.insert_before(assign(x, ir->operands[0]));
 502    i.insert_before(exp);
 503    i.insert_before(assign(exp, ir->operands[1]));
 504
 505    ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
 506    if (lowering(DFREXP_DLDEXP_TO_ARITH))
 507       dfrexp_exp_to_arith(frexp_exp);
 508
 509    /* Extract the biased exponent from <x>. */
 510    i.insert_before(extracted_biased_exp);
 511    i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
 512
 513    i.insert_before(resulting_biased_exp);
 514    i.insert_before(assign(resulting_biased_exp,
 515                           add(extracted_biased_exp, exp)));
 516
 517    /* Test if result is ±0.0, subnormal, or underflow by checking if the
 518     * resulting biased exponent would be less than 0x1. If so, the result is
 519     * 0.0 with the sign of x. (Actually, invert the conditions so that
 520     * immediate values are the second arguments, which is better for i965)
 521     * TODO: Implement in a vector fashion.
 522     */
 523    i.insert_before(zero_sign_x);
 524    for (unsigned elem = 0; elem < vec_elem; elem++) {
 525       ir_variable *unpacked =
 526          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
 527       i.insert_before(unpacked);
 528       i.insert_before(
 529             assign(unpacked,
 530                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
 531       i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
 532                              WRITEMASK_Y));
 533       i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
 534       i.insert_before(assign(zero_sign_x,
 535                              expr(ir_unop_pack_double_2x32, unpacked),
 536                              1 << elem));
 537    }
 538    i.insert_before(is_not_zero_or_underflow);
 539    i.insert_before(assign(is_not_zero_or_underflow,
 540                           gequal(resulting_biased_exp,
 541                                   new(ir) ir_constant(0x1, vec_elem))));
 542    i.insert_before(assign(x, csel(is_not_zero_or_underflow,
 543                                   x, zero_sign_x)));
 544    i.insert_before(assign(resulting_biased_exp,
 545                           csel(is_not_zero_or_underflow,
 546                                resulting_biased_exp, zeroi)));
 547
 548    /* We could test for overflows by checking if the resulting biased exponent
 549     * would be greater than 0xFE. Turns out we don't need to because the GLSL
 550     * spec says:
 551     *
 552     *    "If this product is too large to be represented in the
 553     *     floating-point type, the result is undefined."
 554     */
 555
 556    ir_rvalue *results[4] = {NULL};
 557    for (unsigned elem = 0; elem < vec_elem; elem++) {
 558       ir_variable *unpacked =
 559          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
 560       i.insert_before(unpacked);
 561       i.insert_before(
 562             assign(unpacked,
 563                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
 564
 565       ir_expression *bfi = bitfield_insert(
 566             swizzle_y(unpacked),
 567             i2u(swizzle(resulting_biased_exp, elem, 1)),
 568             exp_shift->clone(ir, NULL),
 569             exp_width->clone(ir, NULL));
 570
 571       i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
 572
 573       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
 574    }
 575
 576    ir->operation = ir_quadop_vector;
 577    ir->operands[0] = results[0];
 578    ir->operands[1] = results[1];
 579    ir->operands[2] = results[2];
 580    ir->operands[3] = results[3];
 581
 582    /* Don't generate new IR that would need to be lowered in an additional
 583     * pass.
 584     */
 585
 586    this->progress = true;
 587 }
 588
 589 void
 590 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
 591 {
 592    const unsigned vec_elem = ir->type->vector_elements;
 593    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 594
 595    /* Double-precision floating-point values are stored as
 596     *   1 sign bit;
 597     *   11 exponent bits;
 598     *   52 mantissa bits.
 599     *
 600     * We're just extracting the significand here, so we only need to modify
 601     * the upper 32-bit uint. Unfortunately we must extract each double
 602     * independently as there is no vector version of unpackDouble.
 603     */
 604
 605    ir_instruction &i = *base_ir;
 606
 607    ir_variable *is_not_zero =
 608       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
 609    ir_rvalue *results[4] = {NULL};
 610
 611    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
 612    i.insert_before(is_not_zero);
 613    i.insert_before(
 614          assign(is_not_zero,
 615                 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
 616
 617    /* TODO: Remake this as more vector-friendly when int64 support is
 618     * available.
 619     */
 620    for (unsigned elem = 0; elem < vec_elem; elem++) {
 621       ir_constant *zero = new(ir) ir_constant(0u, 1);
 622       ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
 623
 624       /* Exponent of double floating-point values in the range [0.5, 1.0). */
 625       ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
 626
 627       ir_variable *bits =
 628          new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
 629       ir_variable *unpacked =
 630          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
 631
 632       ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
 633
 634       i.insert_before(bits);
 635       i.insert_before(unpacked);
 636       i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
 637
 638       /* Manipulate the high uint to remove the exponent and replace it with
 639        * either the default exponent or zero.
 640        */
 641       i.insert_before(assign(bits, swizzle_y(unpacked)));
 642       i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
 643       i.insert_before(assign(bits, bit_or(bits,
 644                                           csel(swizzle(is_not_zero, elem, 1),
 645                                                exponent_value,
 646                                                zero))));
 647       i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
 648       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
 649    }
 650
 651    /* Put the dvec back together */
 652    ir->operation = ir_quadop_vector;
 653    ir->operands[0] = results[0];
 654    ir->operands[1] = results[1];
 655    ir->operands[2] = results[2];
 656    ir->operands[3] = results[3];
 657
 658    this->progress = true;
 659 }
 660
 661 void
 662 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
 663 {
 664    const unsigned vec_elem = ir->type->vector_elements;
 665    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
 666    const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
 667
 668    /* Double-precision floating-point values are stored as
 669     *   1 sign bit;
 670     *   11 exponent bits;
 671     *   52 mantissa bits.
 672     *
 673     * We're just extracting the exponent here, so we only care about the upper
 674     * 32-bit uint.
 675     */
 676
 677    ir_instruction &i = *base_ir;
 678
 679    ir_variable *is_not_zero =
 680       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
 681    ir_variable *high_words =
 682       new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
 683    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
 684    ir_constant *izero = new(ir) ir_constant(0, vec_elem);
 685
 686    ir_rvalue *absval = abs(ir->operands[0]);
 687
 688    i.insert_before(is_not_zero);
 689    i.insert_before(high_words);
 690    i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
 691
 692    /* Extract all of the upper uints. */
 693    for (unsigned elem = 0; elem < vec_elem; elem++) {
 694       ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
 695
 696       i.insert_before(assign(high_words,
 697                              swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
 698                              1 << elem));
 699
 700    }
 701    ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
 702    ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
 703
 704    /* For non-zero inputs, shift the exponent down and apply bias. */
 705    ir->operation = ir_triop_csel;
 706    ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
 707    ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
 708    ir->operands[2] = izero;
 709
 710    this->progress = true;
 711 }
 712
 713 void
 714 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
 715 {
 716    /* Translates
 717     *   ir_binop_carry x y
 718     * into
 719     *   sum = ir_binop_add x y
 720     *   bcarry = ir_binop_less sum x
 721     *   carry = ir_unop_b2i bcarry
 722     */
 723
 724    ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
 725    ir->operation = ir_unop_i2u;
 726    ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
 727    ir->operands[1] = NULL;
 728
 729    this->progress = true;
 730 }
 731
 732 void
 733 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
 734 {
 735    /* Translates
 736     *   ir_binop_borrow x y
 737     * into
 738     *   bcarry = ir_binop_less x y
 739     *   carry = ir_unop_b2i bcarry
 740     */
 741
 742    ir->operation = ir_unop_i2u;
 743    ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
 744    ir->operands[1] = NULL;
 745
 746    this->progress = true;
 747 }
 748
 749 void
 750 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
 751 {
 752    /* Translates
 753     *   ir_unop_saturate x
 754     * into
 755     *   ir_binop_min (ir_binop_max(x, 0.0), 1.0)
 756     */
 757
 758    ir->operation = ir_binop_min;
 759    ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
 760                                            ir->operands[0],
 761                                            new(ir) ir_constant(0.0f));
 762    ir->operands[1] = new(ir) ir_constant(1.0f);
 763
 764    this->progress = true;
 765 }
 766
 767 void
 768 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
 769 {
 770    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
 771                                            ir_var_temporary);
 772    this->base_ir->insert_before(temp);
 773
 774    int nc = ir->operands[0]->type->components();
 775    for (int i = nc - 1; i >= 1; i--) {
 776       ir_assignment *assig;
 777       if (i == (nc - 1)) {
 778          assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
 779                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
 780       } else {
 781          assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
 782                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
 783                                   temp));
 784       }
 785       this->base_ir->insert_before(assig);
 786    }
 787
 788    ir->operation = ir_triop_fma;
 789    ir->operands[0] = swizzle(ir->operands[0], 0, 1);
 790    ir->operands[1] = swizzle(ir->operands[1], 0, 1);
 791    ir->operands[2] = new(ir) ir_dereference_variable(temp);
 792
 793    this->progress = true;
 794
 795 }
 796
 797 void
 798 lower_instructions_visitor::double_lrp(ir_expression *ir)
 799 {
 800    int swizval;
 801    ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
 802    ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
 803
 804    switch (op2->type->vector_elements) {
 805    case 1:
 806       swizval = SWIZZLE_XXXX;
 807       break;
 808    default:
 809       assert(op0->type->vector_elements == op2->type->vector_elements);
 810       swizval = SWIZZLE_XYZW;
 811       break;
 812    }
 813
 814    ir->operation = ir_triop_fma;
 815    ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
 816    ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
 817
 818    this->progress = true;
 819 }
 820
 821 void
 822 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
 823 {
 824    /*
 825     * frtemp = frac(x);
 826     * temp = sub(x, frtemp);
 827     * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
 828     */
 829    ir_instruction &i = *base_ir;
 830    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
 831    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
 832    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
 833                                              ir_var_temporary);
 834
 835    i.insert_before(frtemp);
 836    i.insert_before(assign(frtemp, fract(ir->operands[0])));
 837
 838    ir->operation = ir_binop_add;
 839    ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
 840    ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
 841
 842    this->progress = true;
 843 }
 844
 845 void
 846 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
 847 {
 848    /*
 849     * frtemp = frac(x);
 850     * result = sub(x, frtemp);
 851     */
 852    ir->operation = ir_binop_sub;
 853    ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
 854
 855    this->progress = true;
 856 }
 857 void
 858 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
 859 {
 860    /*
 861     * insane but works
 862     * temp = x + 0.5;
 863     * frtemp = frac(temp);
 864     * t2 = sub(temp, frtemp);
 865     * if (frac(x) == 0.5)
 866     *     result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
 867     *  else
 868     *     result = t2;
 869
 870     */
 871    ir_instruction &i = *base_ir;
 872    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
 873                                              ir_var_temporary);
 874    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
 875                                            ir_var_temporary);
 876    ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
 877                                            ir_var_temporary);
 878    ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
 879    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
 880    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
 881
 882    i.insert_before(temp);
 883    i.insert_before(assign(temp, add(ir->operands[0], p5)));
 884
 885    i.insert_before(frtemp);
 886    i.insert_before(assign(frtemp, fract(temp)));
 887
 888    i.insert_before(t2);
 889    i.insert_before(assign(t2, sub(temp, frtemp)));
 890
 891    ir->operation = ir_triop_csel;
 892    ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
 893                            p5->clone(ir, NULL));
 894    ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
 895                                 zero),
 896                           t2,
 897                           sub(t2, one));
 898    ir->operands[2] = new(ir) ir_dereference_variable(t2);
 899
 900    this->progress = true;
 901 }
 902
 903 void
 904 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
 905 {
 906    /*
 907     * frtemp = frac(x);
 908     * temp = sub(x, frtemp);
 909     * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
 910     */
 911    ir_rvalue *arg = ir->operands[0];
 912    ir_instruction &i = *base_ir;
 913
 914    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
 915    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
 916    ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
 917                                              ir_var_temporary);
 918    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
 919                                            ir_var_temporary);
 920
 921    i.insert_before(frtemp);
 922    i.insert_before(assign(frtemp, fract(arg)));
 923    i.insert_before(temp);
 924    i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
 925
 926    ir->operation = ir_triop_csel;
 927    ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
 928    ir->operands[1] = new (ir) ir_dereference_variable(temp);
 929    ir->operands[2] = add(temp,
 930                          csel(equal(frtemp, zero->clone(ir, NULL)),
 931                               zero->clone(ir, NULL),
 932                               one));
 933
 934    this->progress = true;
 935 }
 936
 937 void
 938 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
 939 {
 940    /*
 941     * temp = x > 0.0 ? 1.0 : 0.0;
 942     * result = x < 0.0 ? -1.0 : temp;
 943     */
 944    ir_rvalue *arg = ir->operands[0];
 945    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
 946    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
 947    ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
 948
 949    ir->operation = ir_triop_csel;
 950    ir->operands[0] = less(arg->clone(ir, NULL),
 951                           zero->clone(ir, NULL));
 952    ir->operands[1] = neg_one;
 953    ir->operands[2] = csel(greater(arg, zero),
 954                           one,
 955                           zero->clone(ir, NULL));
 956
 957    this->progress = true;
 958 }
 959
 960 void
 961 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
 962 {
 963    /* For more details, see:
 964     *
 965     * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel
 966     */
 967    const unsigned elements = ir->operands[0]->type->vector_elements;
 968    ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
 969                                            ir_var_temporary);
 970    ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
 971    ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
 972    ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
 973    ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
 974    ir_constant *c1 = new(ir) ir_constant(1u);
 975    ir_constant *c2 = new(ir) ir_constant(2u);
 976    ir_constant *c4 = new(ir) ir_constant(4u);
 977    ir_constant *c24 = new(ir) ir_constant(24u);
 978
 979    base_ir->insert_before(temp);
 980
 981    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
 982       base_ir->insert_before(assign(temp, ir->operands[0]));
 983    } else {
 984       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
 985       base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
 986    }
 987
 988    /* temp = temp - ((temp >> 1) & 0x55555555u); */
 989    base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
 990                                                          c55555555))));
 991
 992    /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
 993    base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
 994                                            bit_and(rshift(temp, c2),
 995                                                    c33333333->clone(ir, NULL)))));
 996
 997    /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
 998    ir->operation = ir_unop_u2i;
 999    ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
1000                                 c01010101),
1001                             c24);
1002
1003    this->progress = true;
1004 }
1005
1006 void
1007 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
1008 {
1009    ir_variable *bits =
1010       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1011
1012    base_ir->insert_before(bits);
1013    base_ir->insert_before(assign(bits, ir->operands[2]));
1014
1015    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1016       ir_constant *c1 =
1017          new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1018       ir_constant *c32 =
1019          new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1020       ir_constant *cFFFFFFFF =
1021          new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1022
1023       /* At least some hardware treats (x << y) as (x << (y%32)).  This means
1024        * we'd get a mask of 0 when bits is 32.  Special case it.
1025        *
1026        * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
1027        */
1028       ir_expression *mask = csel(equal(bits, c32),
1029                                  cFFFFFFFF,
1030                                  sub(lshift(c1, bits), c1->clone(ir, NULL)));
1031
1032       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1033        *
1034        *    If bits is zero, the result will be zero.
1035        *
1036        * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
1037        * select as in the signed integer case.
1038        *
1039        * (value >> offset) & mask;
1040        */
1041       ir->operation = ir_binop_bit_and;
1042       ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
1043       ir->operands[1] = mask;
1044       ir->operands[2] = NULL;
1045    } else {
1046       ir_constant *c0 =
1047          new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
1048       ir_constant *c32 =
1049          new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1050       ir_variable *temp =
1051          new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
1052
1053       /* temp = 32 - bits; */
1054       base_ir->insert_before(temp);
1055       base_ir->insert_before(assign(temp, sub(c32, bits)));
1056
1057       /* expr = value << (temp - offset)) >> temp; */
1058       ir_expression *expr =
1059          rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
1060
1061       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1062        *
1063        *    If bits is zero, the result will be zero.
1064        *
1065        * Due to the (x << (y%32)) behavior mentioned before, the (value <<
1066        * (32-0)) doesn't "erase" all of the data as we would like, so finish
1067        * up with:
1068        *
1069        * (bits == 0) ? 0 : e;
1070        */
1071       ir->operation = ir_triop_csel;
1072       ir->operands[0] = equal(c0, bits);
1073       ir->operands[1] = c0->clone(ir, NULL);
1074       ir->operands[2] = expr;
1075    }
1076
1077    this->progress = true;
1078 }
1079
1080 void
1081 lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
1082 {
1083    ir_constant *c1;
1084    ir_constant *c32;
1085    ir_constant *cFFFFFFFF;
1086    ir_variable *offset =
1087       new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
1088    ir_variable *bits =
1089       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1090    ir_variable *mask =
1091       new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
1092
1093    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1094       c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
1095       c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1096       cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
1097    } else {
1098       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1099
1100       c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1101       c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1102       cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1103    }
1104
1105    base_ir->insert_before(offset);
1106    base_ir->insert_before(assign(offset, ir->operands[2]));
1107
1108    base_ir->insert_before(bits);
1109    base_ir->insert_before(assign(bits, ir->operands[3]));
1110
1111    /* At least some hardware treats (x << y) as (x << (y%32)).  This means
1112     * we'd get a mask of 0 when bits is 32.  Special case it.
1113     *
1114     * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
1115     *
1116     * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1117     *
1118     *    The result will be undefined if offset or bits is negative, or if the
1119     *    sum of offset and bits is greater than the number of bits used to
1120     *    store the operand.
1121     *
1122     * Since it's undefined, there are a couple other ways this could be
1123     * implemented.  The other way that was considered was to put the csel
1124     * around the whole thing:
1125     *
1126     *    final_result = bits == 32 ? insert : ... ;
1127     */
1128    base_ir->insert_before(mask);
1129
1130    base_ir->insert_before(assign(mask, csel(equal(bits, c32),
1131                                             cFFFFFFFF,
1132                                             lshift(sub(lshift(c1, bits),
1133                                                        c1->clone(ir, NULL)),
1134                                                    offset))));
1135
1136    /* (base & ~mask) | ((insert << offset) & mask) */
1137    ir->operation = ir_binop_bit_or;
1138    ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
1139    ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
1140    ir->operands[2] = NULL;
1141    ir->operands[3] = NULL;
1142
1143    this->progress = true;
1144 }
1145
1146 ir_visitor_status
1147 lower_instructions_visitor::visit_leave(ir_expression *ir)
1148 {
1149    switch (ir->operation) {
1150    case ir_binop_dot:
1151       if (ir->operands[0]->type->is_double())
1152          double_dot_to_fma(ir);
1153       break;
1154    case ir_triop_lrp:
1155       if (ir->operands[0]->type->is_double())
1156          double_lrp(ir);
1157       break;
1158    case ir_binop_sub:
1159       if (lowering(SUB_TO_ADD_NEG))
1160          sub_to_add_neg(ir);
1161       break;
1162
1163    case ir_binop_div:
1164       if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
1165          int_div_to_mul_rcp(ir);
1166       else if ((ir->operands[1]->type->is_float() ||
1167                 ir->operands[1]->type->is_double()) && lowering(DIV_TO_MUL_RCP))
1168          div_to_mul_rcp(ir);
1169       break;
1170
1171    case ir_unop_exp:
1172       if (lowering(EXP_TO_EXP2))
1173          exp_to_exp2(ir);
1174       break;
1175
1176    case ir_unop_log:
1177       if (lowering(LOG_TO_LOG2))
1178          log_to_log2(ir);
1179       break;
1180
1181    case ir_binop_mod:
1182       if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
1183          mod_to_floor(ir);
1184       break;
1185
1186    case ir_binop_pow:
1187       if (lowering(POW_TO_EXP2))
1188          pow_to_exp2(ir);
1189       break;
1190
1191    case ir_binop_ldexp:
1192       if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1193          ldexp_to_arith(ir);
1194       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1195          dldexp_to_arith(ir);
1196       break;
1197
1198    case ir_unop_frexp_exp:
1199       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1200          dfrexp_exp_to_arith(ir);
1201       break;
1202
1203    case ir_unop_frexp_sig:
1204       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1205          dfrexp_sig_to_arith(ir);
1206       break;
1207
1208    case ir_binop_carry:
1209       if (lowering(CARRY_TO_ARITH))
1210          carry_to_arith(ir);
1211       break;
1212
1213    case ir_binop_borrow:
1214       if (lowering(BORROW_TO_ARITH))
1215          borrow_to_arith(ir);
1216       break;
1217
1218    case ir_unop_saturate:
1219       if (lowering(SAT_TO_CLAMP))
1220          sat_to_clamp(ir);
1221       break;
1222
1223    case ir_unop_trunc:
1224       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1225          dtrunc_to_dfrac(ir);
1226       break;
1227
1228    case ir_unop_ceil:
1229       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1230          dceil_to_dfrac(ir);
1231       break;
1232
1233    case ir_unop_floor:
1234       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1235          dfloor_to_dfrac(ir);
1236       break;
1237
1238    case ir_unop_round_even:
1239       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1240          dround_even_to_dfrac(ir);
1241       break;
1242
1243    case ir_unop_sign:
1244       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1245          dsign_to_csel(ir);
1246       break;
1247
1248    case ir_unop_bit_count:
1249       if (lowering(BIT_COUNT_TO_MATH))
1250          bit_count_to_math(ir);
1251       break;
1252
1253    case ir_triop_bitfield_extract:
1254       if (lowering(EXTRACT_TO_SHIFTS))
1255          extract_to_shifts(ir);
1256       break;
1257
1258    case ir_quadop_bitfield_insert:
1259       if (lowering(INSERT_TO_SHIFTS))
1260          insert_to_shifts(ir);
1261       break;
1262
1263
1264    default:
1265       return visit_continue;
1266    }
1267
1268    return visit_continue;
1269 }