glsl: Add lowering pass for ir_unop_find_lsb
[mesa.git] / src / compiler / glsl / lower_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, so
28 * drivers must replace them with some other expression tree.  This pass
29 * lowers some of the most common cases, allowing the lowering code to be
30 * implemented once rather than in each driver backend.
31 *
32 * Currently supported transformations:
33 * - SUB_TO_ADD_NEG
34 * - DIV_TO_MUL_RCP
35 * - INT_DIV_TO_MUL_RCP
36 * - EXP_TO_EXP2
37 * - POW_TO_EXP2
38 * - LOG_TO_LOG2
39 * - MOD_TO_FLOOR
40 * - LDEXP_TO_ARITH
41 * - DFREXP_DLDEXP_TO_ARITH
42 * - CARRY_TO_ARITH
43 * - BORROW_TO_ARITH
44 * - SAT_TO_CLAMP
45 * - DOPS_TO_DFRAC
 * - BIT_COUNT_TO_MATH
 * - EXTRACT_TO_SHIFTS
 * - INSERT_TO_SHIFTS
 * - REVERSE_TO_SHIFTS
 * - FIND_LSB_TO_FLOAT_CAST
46 *
47 * SUB_TO_ADD_NEG:
48 * ---------------
49 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
50 *
51 * This simplifies expression reassociation, and for many backends
52 * there is no subtract operation separate from adding the negation.
53 * Backends with native subtract operations will probably want to
54 * recognize add(op0, neg(op1)), or the reverse, and fuse it back into
55 * a subtract anyway.
56 *
57 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
58 * --------------------------------------
59 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
60 *
61 * Many GPUs don't have a divide instruction (945 and 965 included),
62 * but they do have an RCP instruction to compute an approximate
63 * reciprocal. By breaking the operation down, constant reciprocals
64 * can get constant folded.
65 *
66 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
67 * handles the integer case, converting to and from floating point so that
68 * RCP is possible.
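 *
 * For example, x / 4.0 becomes x * rcp(4.0), which constant folding then
 * turns into x * 0.25.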
69 *
70 * EXP_TO_EXP2 and LOG_TO_LOG2:
71 * ----------------------------
72 * Many GPUs don't have a base e log or exponent instruction, but they
73 * do have base 2 versions, so this pass converts exp and log to exp2
74 * and log2 operations.
75 *
76 * POW_TO_EXP2:
77 * -----------
78 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
79 * x**y to 2**(y * log2(x)).
80 *
81 * MOD_TO_FLOOR:
82 * -------------
83 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
84 *
85 * Many GPUs don't have a MOD instruction (945 and 965 included), and
86 * if we have to break it down like this anyway, it gives an
87 * opportunity to do things like constant fold the (1.0 / op1) easily.
88 *
89 * Note: this was previously implemented as op1 * fract(op0 / op1), but that
90 * implementation had significant precision errors.
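 *
 * As a quick sketch of the lowered form: mod(7.5, 2.0) becomes
 * 7.5 - 2.0 * floor(7.5 / 2.0) = 7.5 - 2.0 * 3.0 = 1.5.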
91 *
92 * LDEXP_TO_ARITH:
93 * -------------
94 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
95 *
96 * DFREXP_DLDEXP_TO_ARITH:
97 * ---------------
98 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
99 * arithmetic and bit ops for double arguments.
100 *
101 * CARRY_TO_ARITH:
102 * ---------------
103 * Converts ir_carry into (x + y) < x.
104 *
105 * BORROW_TO_ARITH:
106 * ----------------
107 * Converts ir_borrow into (x < y).
108 *
109 * SAT_TO_CLAMP:
110 * -------------
111 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
112 *
113 * DOPS_TO_DFRAC:
114 * --------------
115 * Converts double trunc, ceil, floor, round to fract
116 */
117
118 #include "c99_math.h"
119 #include "program/prog_instruction.h" /* for swizzle */
120 #include "compiler/glsl_types.h"
121 #include "ir.h"
122 #include "ir_builder.h"
123 #include "ir_optimization.h"
124
125 using namespace ir_builder;
126
127 namespace {
128
129 class lower_instructions_visitor : public ir_hierarchical_visitor {
130 public:
131 lower_instructions_visitor(unsigned lower)
132 : progress(false), lower(lower) { }
133
134 ir_visitor_status visit_leave(ir_expression *);
135
136 bool progress;
137
138 private:
139 unsigned lower; /**< Bitfield of which operations to lower */
140
141 void sub_to_add_neg(ir_expression *);
142 void div_to_mul_rcp(ir_expression *);
143 void int_div_to_mul_rcp(ir_expression *);
144 void mod_to_floor(ir_expression *);
145 void exp_to_exp2(ir_expression *);
146 void pow_to_exp2(ir_expression *);
147 void log_to_log2(ir_expression *);
148 void ldexp_to_arith(ir_expression *);
149 void dldexp_to_arith(ir_expression *);
150 void dfrexp_sig_to_arith(ir_expression *);
151 void dfrexp_exp_to_arith(ir_expression *);
152 void carry_to_arith(ir_expression *);
153 void borrow_to_arith(ir_expression *);
154 void sat_to_clamp(ir_expression *);
155 void double_dot_to_fma(ir_expression *);
156 void double_lrp(ir_expression *);
157 void dceil_to_dfrac(ir_expression *);
158 void dfloor_to_dfrac(ir_expression *);
159 void dround_even_to_dfrac(ir_expression *);
160 void dtrunc_to_dfrac(ir_expression *);
161 void dsign_to_csel(ir_expression *);
162 void bit_count_to_math(ir_expression *);
163 void extract_to_shifts(ir_expression *);
164 void insert_to_shifts(ir_expression *);
165 void reverse_to_shifts(ir_expression *ir);
166 void find_lsb_to_float_cast(ir_expression *ir);
167 };
168
169 } /* anonymous namespace */
170
171 /**
172 * Determine if a particular type of lowering should occur
173 */
174 #define lowering(x) (this->lower & x)
175
176 bool
177 lower_instructions(exec_list *instructions, unsigned what_to_lower)
178 {
179 lower_instructions_visitor v(what_to_lower);
180
181 visit_list_elements(&v, instructions);
182 return v.progress;
183 }
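
/* A sketch of how a driver backend might invoke this pass; the flag set and
 * the shader->ir name are hypothetical and depend on what the hardware lacks
 * natively:
 *
 *    lower_instructions(shader->ir,
 *                       SUB_TO_ADD_NEG | DIV_TO_MUL_RCP | MOD_TO_FLOOR |
 *                       EXP_TO_EXP2 | LOG_TO_LOG2 | FIND_LSB_TO_FLOAT_CAST);
 *
 * Each flag enables the corresponding transformation documented above, and
 * the return value reports whether anything was actually lowered.
 */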
184
185 void
186 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
187 {
188 ir->operation = ir_binop_add;
189 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
190 ir->operands[1], NULL);
191 this->progress = true;
192 }
193
194 void
195 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
196 {
197 assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
198
199 /* New expression for the 1.0 / op1 */
200 ir_rvalue *expr;
201 expr = new(ir) ir_expression(ir_unop_rcp,
202 ir->operands[1]->type,
203 ir->operands[1]);
204
205 /* op0 / op1 -> op0 * (1.0 / op1) */
206 ir->operation = ir_binop_mul;
207 ir->operands[1] = expr;
208
209 this->progress = true;
210 }
211
212 void
213 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
214 {
215 assert(ir->operands[1]->type->is_integer());
216
217 /* Be careful with integer division -- we need to do it as a
218 * float and re-truncate, since rcp(n > 1) of an integer would
219 * just be 0.
220 */
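
/* A small worked sketch of the lowering below: 7 / 2 becomes
 * f2i(i2f(7) * rcp(i2f(2))) = f2i(3.5) = 3, whereas an integer rcp(2)
 * would simply have produced 0.
 */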
221 ir_rvalue *op0, *op1;
222 const struct glsl_type *vec_type;
223
224 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
225 ir->operands[1]->type->vector_elements,
226 ir->operands[1]->type->matrix_columns);
227
228 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
229 op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
230 else
231 op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
232
233 op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
234
235 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
236 ir->operands[0]->type->vector_elements,
237 ir->operands[0]->type->matrix_columns);
238
239 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
240 op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
241 else
242 op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
243
244 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
245 ir->type->vector_elements,
246 ir->type->matrix_columns);
247
248 op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
249
250 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
251 ir->operation = ir_unop_f2i;
252 ir->operands[0] = op0;
253 } else {
254 ir->operation = ir_unop_i2u;
255 ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
256 }
257 ir->operands[1] = NULL;
258
259 this->progress = true;
260 }
261
262 void
263 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
264 {
265 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
266
267 ir->operation = ir_unop_exp2;
268 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
269 ir->operands[0], log2_e);
270 this->progress = true;
271 }
272
273 void
274 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
275 {
276 ir_expression *const log2_x =
277 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
278 ir->operands[0]);
279
280 ir->operation = ir_unop_exp2;
281 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
282 ir->operands[1], log2_x);
283 ir->operands[1] = NULL;
284 this->progress = true;
285 }
286
287 void
288 lower_instructions_visitor::log_to_log2(ir_expression *ir)
289 {
290 ir->operation = ir_binop_mul;
291 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
292 ir->operands[0], NULL);
293 ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
294 this->progress = true;
295 }
296
297 void
298 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
299 {
300 ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
301 ir_var_temporary);
302 ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
303 ir_var_temporary);
304 this->base_ir->insert_before(x);
305 this->base_ir->insert_before(y);
306
307 ir_assignment *const assign_x =
308 new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
309 ir->operands[0], NULL);
310 ir_assignment *const assign_y =
311 new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
312 ir->operands[1], NULL);
313
314 this->base_ir->insert_before(assign_x);
315 this->base_ir->insert_before(assign_y);
316
317 ir_expression *const div_expr =
318 new(ir) ir_expression(ir_binop_div, x->type,
319 new(ir) ir_dereference_variable(x),
320 new(ir) ir_dereference_variable(y));
321
322 /* Don't generate new IR that would need to be lowered in an additional
323 * pass.
324 */
325 if (lowering(DIV_TO_MUL_RCP) && (ir->type->is_float() || ir->type->is_double()))
326 div_to_mul_rcp(div_expr);
327
328 ir_expression *const floor_expr =
329 new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
330
331 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
332 dfloor_to_dfrac(floor_expr);
333
334 ir_expression *const mul_expr =
335 new(ir) ir_expression(ir_binop_mul,
336 new(ir) ir_dereference_variable(y),
337 floor_expr);
338
339 ir->operation = ir_binop_sub;
340 ir->operands[0] = new(ir) ir_dereference_variable(x);
341 ir->operands[1] = mul_expr;
342 this->progress = true;
343 }
344
345 void
346 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
347 {
348 /* Translates
349 * ir_binop_ldexp x exp
350 * into
351 *
352 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
353 * resulting_biased_exp = extracted_biased_exp + exp;
354 *
355 * if (resulting_biased_exp < 1 || x == 0.0f) {
356 * return copysign(0.0, x);
357 * }
358 *
359 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
360 * lshift(i2u(resulting_biased_exp), exp_shift));
361 *
362 * which we can't actually implement as such, since the GLSL IR doesn't
363 * have vectorized if-statements. We actually implement it without branches
364 * using conditional-select:
365 *
366 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
367 * resulting_biased_exp = extracted_biased_exp + exp;
368 *
369 * is_not_zero_or_underflow = logic_and(nequal(x, 0.0f),
370 * gequal(resulting_biased_exp, 1));
371 * x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
372 * resulting_biased_exp = csel(is_not_zero_or_underflow,
373 * resulting_biased_exp, 0);
374 *
375 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
376 * lshift(i2u(resulting_biased_exp), exp_shift));
377 */
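
/* A worked sketch of the bit manipulation above (assuming IEEE single
 * precision): ldexp(1.5f, 4) starts from 1.5f = 0x3FC00000, whose biased
 * exponent is (0x3FC00000 >> 23) = 0x7F.  Adding 4 gives 0x83, and inserting
 * that back into the exponent field yields 0x41C00000 = 24.0f.
 */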
378
379 const unsigned vec_elem = ir->type->vector_elements;
380
381 /* Types */
382 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
383 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
384
385 /* Constants */
386 ir_constant *zeroi = ir_constant::zero(ir, ivec);
387
388 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
389
390 ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
391 ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
392
393 /* Temporary variables */
394 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
395 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
396
397 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
398 ir_var_temporary);
399
400 ir_variable *extracted_biased_exp =
401 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
402 ir_variable *resulting_biased_exp =
403 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
404
405 ir_variable *is_not_zero_or_underflow =
406 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
407
408 ir_instruction &i = *base_ir;
409
410 /* Copy <x> and <exp> arguments. */
411 i.insert_before(x);
412 i.insert_before(assign(x, ir->operands[0]));
413 i.insert_before(exp);
414 i.insert_before(assign(exp, ir->operands[1]));
415
416 /* Extract the biased exponent from <x>. */
417 i.insert_before(extracted_biased_exp);
418 i.insert_before(assign(extracted_biased_exp,
419 rshift(bitcast_f2i(abs(x)), exp_shift)));
420
421 i.insert_before(resulting_biased_exp);
422 i.insert_before(assign(resulting_biased_exp,
423 add(extracted_biased_exp, exp)));
424
425 /* Test if result is ±0.0, subnormal, or underflow by checking if the
426 * resulting biased exponent would be less than 0x1. If so, the result is
427 * 0.0 with the sign of x. (Actually, invert the conditions so that
428 * immediate values are the second arguments, which is better for i965)
429 */
430 i.insert_before(zero_sign_x);
431 i.insert_before(assign(zero_sign_x,
432 bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
433
434 i.insert_before(is_not_zero_or_underflow);
435 i.insert_before(assign(is_not_zero_or_underflow,
436 logic_and(nequal(x, new(ir) ir_constant(0.0f, vec_elem)),
437 gequal(resulting_biased_exp,
438 new(ir) ir_constant(0x1, vec_elem)))));
439 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
440 x, zero_sign_x)));
441 i.insert_before(assign(resulting_biased_exp,
442 csel(is_not_zero_or_underflow,
443 resulting_biased_exp, zeroi)));
444
445 /* We could test for overflows by checking if the resulting biased exponent
446 * would be greater than 0xFE. Turns out we don't need to because the GLSL
447 * spec says:
448 *
449 * "If this product is too large to be represented in the
450 * floating-point type, the result is undefined."
451 */
452
453 ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
454 ir->operation = ir_unop_bitcast_i2f;
455 ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
456 exp_shift_clone, exp_width);
457 ir->operands[1] = NULL;
458
459 this->progress = true;
460 }
461
462 void
463 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
464 {
465 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
466 * from the significand.
467 */
468
469 const unsigned vec_elem = ir->type->vector_elements;
470
471 /* Types */
472 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
473 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
474
475 /* Constants */
476 ir_constant *zeroi = ir_constant::zero(ir, ivec);
477
478 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
479
480 ir_constant *exp_shift = new(ir) ir_constant(20u);
481 ir_constant *exp_width = new(ir) ir_constant(11u);
482 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
483
484 /* Temporary variables */
485 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
486 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
487
488 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
489 ir_var_temporary);
490
491 ir_variable *extracted_biased_exp =
492 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
493 ir_variable *resulting_biased_exp =
494 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
495
496 ir_variable *is_not_zero_or_underflow =
497 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
498
499 ir_instruction &i = *base_ir;
500
501 /* Copy <x> and <exp> arguments. */
502 i.insert_before(x);
503 i.insert_before(assign(x, ir->operands[0]));
504 i.insert_before(exp);
505 i.insert_before(assign(exp, ir->operands[1]));
506
507 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
508 if (lowering(DFREXP_DLDEXP_TO_ARITH))
509 dfrexp_exp_to_arith(frexp_exp);
510
511 /* Extract the biased exponent from <x>. */
512 i.insert_before(extracted_biased_exp);
513 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
514
515 i.insert_before(resulting_biased_exp);
516 i.insert_before(assign(resulting_biased_exp,
517 add(extracted_biased_exp, exp)));
518
519 /* Test if result is ±0.0, subnormal, or underflow by checking if the
520 * resulting biased exponent would be less than 0x1. If so, the result is
521 * 0.0 with the sign of x. (Actually, invert the conditions so that
522 * immediate values are the second arguments, which is better for i965)
523 * TODO: Implement in a vector fashion.
524 */
525 i.insert_before(zero_sign_x);
526 for (unsigned elem = 0; elem < vec_elem; elem++) {
527 ir_variable *unpacked =
528 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
529 i.insert_before(unpacked);
530 i.insert_before(
531 assign(unpacked,
532 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
533 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
534 WRITEMASK_Y));
535 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
536 i.insert_before(assign(zero_sign_x,
537 expr(ir_unop_pack_double_2x32, unpacked),
538 1 << elem));
539 }
540 i.insert_before(is_not_zero_or_underflow);
541 i.insert_before(assign(is_not_zero_or_underflow,
542 gequal(resulting_biased_exp,
543 new(ir) ir_constant(0x1, vec_elem))));
544 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
545 x, zero_sign_x)));
546 i.insert_before(assign(resulting_biased_exp,
547 csel(is_not_zero_or_underflow,
548 resulting_biased_exp, zeroi)));
549
550 /* We could test for overflows by checking if the resulting biased exponent
551 * would be greater than 0xFE. Turns out we don't need to because the GLSL
552 * spec says:
553 *
554 * "If this product is too large to be represented in the
555 * floating-point type, the result is undefined."
556 */
557
558 ir_rvalue *results[4] = {NULL};
559 for (unsigned elem = 0; elem < vec_elem; elem++) {
560 ir_variable *unpacked =
561 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
562 i.insert_before(unpacked);
563 i.insert_before(
564 assign(unpacked,
565 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
566
567 ir_expression *bfi = bitfield_insert(
568 swizzle_y(unpacked),
569 i2u(swizzle(resulting_biased_exp, elem, 1)),
570 exp_shift->clone(ir, NULL),
571 exp_width->clone(ir, NULL));
572
573 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
574
575 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
576 }
577
578 ir->operation = ir_quadop_vector;
579 ir->operands[0] = results[0];
580 ir->operands[1] = results[1];
581 ir->operands[2] = results[2];
582 ir->operands[3] = results[3];
583
584 /* Don't generate new IR that would need to be lowered in an additional
585 * pass.
586 */
587
588 this->progress = true;
589 }
590
591 void
592 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
593 {
594 const unsigned vec_elem = ir->type->vector_elements;
595 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
596
597 /* Double-precision floating-point values are stored as
598 * 1 sign bit;
599 * 11 exponent bits;
600 * 52 mantissa bits.
601 *
602 * We're just extracting the significand here, so we only need to modify
603 * the upper 32-bit uint. Unfortunately we must extract each double
604 * independently as there is no vector version of unpackDouble.
605 */
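
/* As a sketch: frexp(8.0) should return a significand of 0.5 and set the
 * exponent to 4.  The high dword of 8.0 is 0x40200000; masking with
 * 0x800FFFFF clears the exponent field, and OR-ing in 0x3FE00000 rebuilds
 * the number with the fixed [0.5, 1.0) exponent, giving 0x3FE00000 = 0.5.
 */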
606
607 ir_instruction &i = *base_ir;
608
609 ir_variable *is_not_zero =
610 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
611 ir_rvalue *results[4] = {NULL};
612
613 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
614 i.insert_before(is_not_zero);
615 i.insert_before(
616 assign(is_not_zero,
617 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
618
619 /* TODO: Remake this as more vector-friendly when int64 support is
620 * available.
621 */
622 for (unsigned elem = 0; elem < vec_elem; elem++) {
623 ir_constant *zero = new(ir) ir_constant(0u, 1);
624 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
625
626 /* Exponent of double floating-point values in the range [0.5, 1.0). */
627 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
628
629 ir_variable *bits =
630 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
631 ir_variable *unpacked =
632 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
633
634 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
635
636 i.insert_before(bits);
637 i.insert_before(unpacked);
638 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
639
640 /* Manipulate the high uint to remove the exponent and replace it with
641 * either the default exponent or zero.
642 */
643 i.insert_before(assign(bits, swizzle_y(unpacked)));
644 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
645 i.insert_before(assign(bits, bit_or(bits,
646 csel(swizzle(is_not_zero, elem, 1),
647 exponent_value,
648 zero))));
649 i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
650 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
651 }
652
653 /* Put the dvec back together */
654 ir->operation = ir_quadop_vector;
655 ir->operands[0] = results[0];
656 ir->operands[1] = results[1];
657 ir->operands[2] = results[2];
658 ir->operands[3] = results[3];
659
660 this->progress = true;
661 }
662
663 void
664 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
665 {
666 const unsigned vec_elem = ir->type->vector_elements;
667 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
668 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
669
670 /* Double-precision floating-point values are stored as
671 * 1 sign bit;
672 * 11 exponent bits;
673 * 52 mantissa bits.
674 *
675 * We're just extracting the exponent here, so we only care about the upper
676 * 32-bit uint.
677 */
678
679 ir_instruction &i = *base_ir;
680
681 ir_variable *is_not_zero =
682 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
683 ir_variable *high_words =
684 new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
685 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
686 ir_constant *izero = new(ir) ir_constant(0, vec_elem);
687
688 ir_rvalue *absval = abs(ir->operands[0]);
689
690 i.insert_before(is_not_zero);
691 i.insert_before(high_words);
692 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
693
694 /* Extract all of the upper uints. */
695 for (unsigned elem = 0; elem < vec_elem; elem++) {
696 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
697
698 i.insert_before(assign(high_words,
699 swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
700 1 << elem));
701
702 }
703 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
704 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
705
706 /* For non-zero inputs, shift the exponent down and apply bias. */
707 ir->operation = ir_triop_csel;
708 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
709 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
710 ir->operands[2] = izero;
711
712 this->progress = true;
713 }
714
715 void
716 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
717 {
718 /* Translates
719 * ir_binop_carry x y
720 * into
721 * sum = ir_binop_add x y
722 * bcarry = ir_binop_less sum x
723 * carry = ir_unop_b2i bcarry
724 */
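
/* For example: uaddCarry(0xFFFFFFFFu, 2u) wraps to a sum of 1, and since
 * 1 < 0xFFFFFFFFu the carry out is 1; for 1u + 2u the sum is 3, 3 < 1 is
 * false, and the carry out is 0.
 */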
725
726 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
727 ir->operation = ir_unop_i2u;
728 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
729 ir->operands[1] = NULL;
730
731 this->progress = true;
732 }
733
734 void
735 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
736 {
737 /* Translates
738 * ir_binop_borrow x y
739 * into
740 * bcarry = ir_binop_less x y
741 * carry = ir_unop_b2i bcarry
742 */
743
744 ir->operation = ir_unop_i2u;
745 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
746 ir->operands[1] = NULL;
747
748 this->progress = true;
749 }
750
751 void
752 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
753 {
754 /* Translates
755 * ir_unop_saturate x
756 * into
757 * ir_binop_min (ir_binop_max(x, 0.0), 1.0)
758 */
759
760 ir->operation = ir_binop_min;
761 ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
762 ir->operands[0],
763 new(ir) ir_constant(0.0f));
764 ir->operands[1] = new(ir) ir_constant(1.0f);
765
766 this->progress = true;
767 }
768
769 void
770 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
771 {
772 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
773 ir_var_temporary);
774 this->base_ir->insert_before(temp);
775
776 int nc = ir->operands[0]->type->components();
777 for (int i = nc - 1; i >= 1; i--) {
778 ir_assignment *assig;
779 if (i == (nc - 1)) {
780 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
781 swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
782 } else {
783 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
784 swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
785 temp));
786 }
787 this->base_ir->insert_before(assig);
788 }
789
790 ir->operation = ir_triop_fma;
791 ir->operands[0] = swizzle(ir->operands[0], 0, 1);
792 ir->operands[1] = swizzle(ir->operands[1], 0, 1);
793 ir->operands[2] = new(ir) ir_dereference_variable(temp);
794
795 this->progress = true;
796
797 }
798
799 void
800 lower_instructions_visitor::double_lrp(ir_expression *ir)
801 {
802 int swizval;
803 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
804 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
805
806 switch (op2->type->vector_elements) {
807 case 1:
808 swizval = SWIZZLE_XXXX;
809 break;
810 default:
811 assert(op0->type->vector_elements == op2->type->vector_elements);
812 swizval = SWIZZLE_XYZW;
813 break;
814 }
815
816 ir->operation = ir_triop_fma;
817 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
818 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
819
820 this->progress = true;
821 }
822
823 void
824 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
825 {
826 /*
827 * frtemp = frac(x);
828 * temp = sub(x, frtemp);
829 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
830 */
831 ir_instruction &i = *base_ir;
832 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
833 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
834 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
835 ir_var_temporary);
836
837 i.insert_before(frtemp);
838 i.insert_before(assign(frtemp, fract(ir->operands[0])));
839
840 ir->operation = ir_binop_add;
841 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
842 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
843
844 this->progress = true;
845 }
846
847 void
848 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
849 {
850 /*
851 * frtemp = frac(x);
852 * result = sub(x, frtemp);
853 */
854 ir->operation = ir_binop_sub;
855 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
856
857 this->progress = true;
858 }
859 void
860 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
861 {
862 /*
863 * insane but works
864 * temp = x + 0.5;
865 * frtemp = frac(temp);
866 * t2 = sub(temp, frtemp);
867 * if (frac(x) == 0.5)
868 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
869 * else
870 * result = t2;
871
872 */
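
/* A quick sanity check of the scheme above: for x = 2.5, temp = 3.0,
 * frtemp = 0.0, t2 = 3.0.  Since frac(x) == 0.5 we look at
 * frac(t2 * 0.5) = frac(1.5) = 0.5 != 0, so the result is t2 - 1 = 2.0,
 * i.e. the even neighbor.  For x = 3.5, t2 = 4.0 and frac(2.0) == 0, so
 * the result stays 4.0.
 */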
873 ir_instruction &i = *base_ir;
874 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
875 ir_var_temporary);
876 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
877 ir_var_temporary);
878 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
879 ir_var_temporary);
880 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
881 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
882 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
883
884 i.insert_before(temp);
885 i.insert_before(assign(temp, add(ir->operands[0], p5)));
886
887 i.insert_before(frtemp);
888 i.insert_before(assign(frtemp, fract(temp)));
889
890 i.insert_before(t2);
891 i.insert_before(assign(t2, sub(temp, frtemp)));
892
893 ir->operation = ir_triop_csel;
894 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
895 p5->clone(ir, NULL));
896 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
897 zero),
898 t2,
899 sub(t2, one));
900 ir->operands[2] = new(ir) ir_dereference_variable(t2);
901
902 this->progress = true;
903 }
904
905 void
906 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
907 {
908 /*
909 * frtemp = frac(x);
910 * temp = sub(x, frtemp);
911 * result = x >= 0 ? temp : temp + ((frtemp == 0.0) ? 0 : 1);
912 */
913 ir_rvalue *arg = ir->operands[0];
914 ir_instruction &i = *base_ir;
915
916 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
917 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
918 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
919 ir_var_temporary);
920 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
921 ir_var_temporary);
922
923 i.insert_before(frtemp);
924 i.insert_before(assign(frtemp, fract(arg)));
925 i.insert_before(temp);
926 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
927
928 ir->operation = ir_triop_csel;
929 ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
930 ir->operands[1] = new (ir) ir_dereference_variable(temp);
931 ir->operands[2] = add(temp,
932 csel(equal(frtemp, zero->clone(ir, NULL)),
933 zero->clone(ir, NULL),
934 one));
935
936 this->progress = true;
937 }
938
939 void
940 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
941 {
942 /*
943 * temp = x > 0.0 ? 1.0 : 0.0;
944 * result = x < 0.0 ? -1.0 : temp;
945 */
946 ir_rvalue *arg = ir->operands[0];
947 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
948 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
949 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
950
951 ir->operation = ir_triop_csel;
952 ir->operands[0] = less(arg->clone(ir, NULL),
953 zero->clone(ir, NULL));
954 ir->operands[1] = neg_one;
955 ir->operands[2] = csel(greater(arg, zero),
956 one,
957 zero->clone(ir, NULL));
958
959 this->progress = true;
960 }
961
962 void
963 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
964 {
965 /* For more details, see:
966 *
967 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
968 */
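
/* A worked sketch on one byte: for temp = 0xF0 the three steps below give
 * 0xA0 (pair counts 2,2,0,0), then 0x40 (nibble counts 4,0), and finally
 * (((0x40 + 0x04) & 0x0F0F0F0F) * 0x01010101) >> 24 = 4, the popcount of 0xF0.
 */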
969 const unsigned elements = ir->operands[0]->type->vector_elements;
970 ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
971 ir_var_temporary);
972 ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
973 ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
974 ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
975 ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
976 ir_constant *c1 = new(ir) ir_constant(1u);
977 ir_constant *c2 = new(ir) ir_constant(2u);
978 ir_constant *c4 = new(ir) ir_constant(4u);
979 ir_constant *c24 = new(ir) ir_constant(24u);
980
981 base_ir->insert_before(temp);
982
983 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
984 base_ir->insert_before(assign(temp, ir->operands[0]));
985 } else {
986 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
987 base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
988 }
989
990 /* temp = temp - ((temp >> 1) & 0x55555555u); */
991 base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
992 c55555555))));
993
994 /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
995 base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
996 bit_and(rshift(temp, c2),
997 c33333333->clone(ir, NULL)))));
998
999 /* int((((temp + (temp >> 4)) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
1000 ir->operation = ir_unop_u2i;
1001 ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
1002 c01010101),
1003 c24);
1004
1005 this->progress = true;
1006 }
1007
1008 void
1009 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
1010 {
1011 ir_variable *bits =
1012 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1013
1014 base_ir->insert_before(bits);
1015 base_ir->insert_before(assign(bits, ir->operands[2]));
1016
1017 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1018 ir_constant *c1 =
1019 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1020 ir_constant *c32 =
1021 new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1022 ir_constant *cFFFFFFFF =
1023 new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1024
1025 /* At least some hardware treats (x << y) as (x << (y%32)). This means
1026 * we'd get a mask of 0 when bits is 32. Special case it.
1027 *
1028 * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
1029 */
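
/* For instance: bits = 8 gives (1u << 8) - 1u = 0xFF, while bits = 32 takes
 * the special case and gives 0xFFFFFFFF directly.
 */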
1030 ir_expression *mask = csel(equal(bits, c32),
1031 cFFFFFFFF,
1032 sub(lshift(c1, bits), c1->clone(ir, NULL)));
1033
1034 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1035 *
1036 * If bits is zero, the result will be zero.
1037 *
1038 * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
1039 * select as in the signed integer case.
1040 *
1041 * (value >> offset) & mask;
1042 */
1043 ir->operation = ir_binop_bit_and;
1044 ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
1045 ir->operands[1] = mask;
1046 ir->operands[2] = NULL;
1047 } else {
1048 ir_constant *c0 =
1049 new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
1050 ir_constant *c32 =
1051 new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1052 ir_variable *temp =
1053 new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
1054
1055 /* temp = 32 - bits; */
1056 base_ir->insert_before(temp);
1057 base_ir->insert_before(assign(temp, sub(c32, bits)));
1058
1059 /* expr = value << (temp - offset)) >> temp; */
1060 ir_expression *expr =
1061 rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
1062
1063 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1064 *
1065 * If bits is zero, the result will be zero.
1066 *
1067 * Due to the (x << (y%32)) behavior mentioned before, the (value <<
1068 * (32-0)) doesn't "erase" all of the data as we would like, so finish
1069 * up with:
1070 *
1071 * (bits == 0) ? 0 : e;
1072 */
1073 ir->operation = ir_triop_csel;
1074 ir->operands[0] = equal(c0, bits);
1075 ir->operands[1] = c0->clone(ir, NULL);
1076 ir->operands[2] = expr;
1077 }
1078
1079 this->progress = true;
1080 }
1081
1082 void
1083 lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
1084 {
1085 ir_constant *c1;
1086 ir_constant *c32;
1087 ir_constant *cFFFFFFFF;
1088 ir_variable *offset =
1089 new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
1090 ir_variable *bits =
1091 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1092 ir_variable *mask =
1093 new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
1094
1095 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1096 c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
1097 c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1098 cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
1099 } else {
1100 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1101
1102 c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1103 c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1104 cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1105 }
1106
1107 base_ir->insert_before(offset);
1108 base_ir->insert_before(assign(offset, ir->operands[2]));
1109
1110 base_ir->insert_before(bits);
1111 base_ir->insert_before(assign(bits, ir->operands[3]));
1112
1113 /* At least some hardware treats (x << y) as (x << (y%32)). This means
1114 * we'd get a mask of 0 when bits is 32. Special case it.
1115 *
1116 * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
1117 *
1118 * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1119 *
1120 * The result will be undefined if offset or bits is negative, or if the
1121 * sum of offset and bits is greater than the number of bits used to
1122 * store the operand.
1123 *
1124 * Since it's undefined, there are a couple other ways this could be
1125 * implemented. The other way that was considered was to put the csel
1126 * around the whole thing:
1127 *
1128 * final_result = bits == 32 ? insert : ... ;
1129 */
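
/* A small sketch: for offset = 4 and bits = 3 the mask is
 * ((1u << 3) - 1u) << 4 = 0x70, so bitfieldInsert(base, insert, 4, 3)
 * becomes (base & ~0x70) | ((insert << 4) & 0x70).
 */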
1130 base_ir->insert_before(mask);
1131
1132 base_ir->insert_before(assign(mask, csel(equal(bits, c32),
1133 cFFFFFFFF,
1134 lshift(sub(lshift(c1, bits),
1135 c1->clone(ir, NULL)),
1136 offset))));
1137
1138 /* (base & ~mask) | ((insert << offset) & mask) */
1139 ir->operation = ir_binop_bit_or;
1140 ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
1141 ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
1142 ir->operands[2] = NULL;
1143 ir->operands[3] = NULL;
1144
1145 this->progress = true;
1146 }
1147
1148 void
1149 lower_instructions_visitor::reverse_to_shifts(ir_expression *ir)
1150 {
1151 /* For more details, see:
1152 *
1153 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
1154 */
1155 ir_constant *c1 =
1156 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1157 ir_constant *c2 =
1158 new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements);
1159 ir_constant *c4 =
1160 new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements);
1161 ir_constant *c8 =
1162 new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements);
1163 ir_constant *c16 =
1164 new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements);
1165 ir_constant *c33333333 =
1166 new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements);
1167 ir_constant *c55555555 =
1168 new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements);
1169 ir_constant *c0F0F0F0F =
1170 new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements);
1171 ir_constant *c00FF00FF =
1172 new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements);
1173 ir_variable *temp =
1174 new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements),
1175 "temp", ir_var_temporary);
1176 ir_instruction &i = *base_ir;
1177
1178 i.insert_before(temp);
1179
1180 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1181 i.insert_before(assign(temp, ir->operands[0]));
1182 } else {
1183 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1184 i.insert_before(assign(temp, i2u(ir->operands[0])));
1185 }
1186
1187 /* Swap odd and even bits.
1188 *
1189 * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1);
1190 */
1191 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555),
1192 lshift(bit_and(temp, c55555555->clone(ir, NULL)),
1193 c1->clone(ir, NULL)))));
1194 /* Swap consecutive pairs.
1195 *
1196 * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2);
1197 */
1198 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333),
1199 lshift(bit_and(temp, c33333333->clone(ir, NULL)),
1200 c2->clone(ir, NULL)))));
1201
1202 /* Swap nibbles.
1203 *
1204 * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4);
1205 */
1206 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F),
1207 lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)),
1208 c4->clone(ir, NULL)))));
1209
1210 /* The last step is, basically, bswap. Swap the bytes, then swap the
1211 * words. When this code is run through GCC on x86, it does generate a
1212 * bswap instruction.
1213 *
1214 * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8);
1215 * temp = ( temp >> 16 ) | ( temp << 16);
1216 */
1217 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF),
1218 lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)),
1219 c8->clone(ir, NULL)))));
1220
1221 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1222 ir->operation = ir_binop_bit_or;
1223 ir->operands[0] = rshift(temp, c16);
1224 ir->operands[1] = lshift(temp, c16->clone(ir, NULL));
1225 } else {
1226 ir->operation = ir_unop_u2i;
1227 ir->operands[0] = bit_or(rshift(temp, c16),
1228 lshift(temp, c16->clone(ir, NULL)));
1229 }
1230
1231 this->progress = true;
1232 }
1233
1234 void
1235 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
1236 {
1237 /* For more details, see:
1238 *
1239 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1240 */
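
/* A worked sketch: findLSB(12) computes 12 & -12 = 4; float(4) has the bit
 * pattern 0x40800000, so (0x40800000 >> 23) - 0x7F = 0x81 - 0x7F = 2, the
 * index of the lowest set bit.  findLSB(0) falls through to the final csel
 * and returns -1.
 */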
1241 const unsigned elements = ir->operands[0]->type->vector_elements;
1242 ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
1243 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1244 ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1245 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1246 ir_variable *temp =
1247 new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary);
1248 ir_variable *lsb_only =
1249 new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary);
1250 ir_variable *as_float =
1251 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1252 ir_variable *lsb =
1253 new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary);
1254
1255 ir_instruction &i = *base_ir;
1256
1257 i.insert_before(temp);
1258
1259 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1260 i.insert_before(assign(temp, ir->operands[0]));
1261 } else {
1262 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1263 i.insert_before(assign(temp, u2i(ir->operands[0])));
1264 }
1265
1266 /* The int-to-float conversion is lossless because (value & -value) is
1267 * either a power of two or zero. We don't use the result in the zero
1268 * case. The uint() cast is necessary so that 0x80000000 does not
1269 * generate a negative value.
1270 *
1271 * uint lsb_only = uint(value & -value);
1272 * float as_float = float(lsb_only);
1273 */
1274 i.insert_before(lsb_only);
1275 i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
1276
1277 i.insert_before(as_float);
1278 i.insert_before(assign(as_float, u2f(lsb_only)));
1279
1280 /* This is basically an open-coded frexp. Implementations that have a
1281 * native frexp instruction would be better served by that. This is
1282 * optimized versus a full-featured open-coded implementation in two ways:
1283 *
1284 * - We don't care about a correct result from subnormal numbers (including
1285 * 0.0), so the raw exponent can always be safely unbiased.
1286 *
1287 * - The value cannot be negative, so it does not need to be masked off to
1288 * extract the exponent.
1289 *
1290 * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1291 */
1292 i.insert_before(lsb);
1293 i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1294
1295 /* Use lsb_only in the comparison instead of temp so that the & (far above)
1296 * can possibly generate the result without an explicit comparison.
1297 *
1298 * (lsb_only == 0) ? -1 : lsb;
1299 *
1300 * Since our input values are all integers, the unbiased exponent must not
1301 * be negative. It will only be negative (-0x7f, in fact) if lsb_only is
1302 * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is
1303 * better is likely GPU dependent. Either way, the difference should be
1304 * small.
1305 */
1306 ir->operation = ir_triop_csel;
1307 ir->operands[0] = equal(lsb_only, c0);
1308 ir->operands[1] = cminus1;
1309 ir->operands[2] = new(ir) ir_dereference_variable(lsb);
1310
1311 this->progress = true;
1312 }
1313
1314 ir_visitor_status
1315 lower_instructions_visitor::visit_leave(ir_expression *ir)
1316 {
1317 switch (ir->operation) {
1318 case ir_binop_dot:
1319 if (ir->operands[0]->type->is_double())
1320 double_dot_to_fma(ir);
1321 break;
1322 case ir_triop_lrp:
1323 if (ir->operands[0]->type->is_double())
1324 double_lrp(ir);
1325 break;
1326 case ir_binop_sub:
1327 if (lowering(SUB_TO_ADD_NEG))
1328 sub_to_add_neg(ir);
1329 break;
1330
1331 case ir_binop_div:
1332 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
1333 int_div_to_mul_rcp(ir);
1334 else if ((ir->operands[1]->type->is_float() ||
1335 ir->operands[1]->type->is_double()) && lowering(DIV_TO_MUL_RCP))
1336 div_to_mul_rcp(ir);
1337 break;
1338
1339 case ir_unop_exp:
1340 if (lowering(EXP_TO_EXP2))
1341 exp_to_exp2(ir);
1342 break;
1343
1344 case ir_unop_log:
1345 if (lowering(LOG_TO_LOG2))
1346 log_to_log2(ir);
1347 break;
1348
1349 case ir_binop_mod:
1350 if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
1351 mod_to_floor(ir);
1352 break;
1353
1354 case ir_binop_pow:
1355 if (lowering(POW_TO_EXP2))
1356 pow_to_exp2(ir);
1357 break;
1358
1359 case ir_binop_ldexp:
1360 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1361 ldexp_to_arith(ir);
1362 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1363 dldexp_to_arith(ir);
1364 break;
1365
1366 case ir_unop_frexp_exp:
1367 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1368 dfrexp_exp_to_arith(ir);
1369 break;
1370
1371 case ir_unop_frexp_sig:
1372 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1373 dfrexp_sig_to_arith(ir);
1374 break;
1375
1376 case ir_binop_carry:
1377 if (lowering(CARRY_TO_ARITH))
1378 carry_to_arith(ir);
1379 break;
1380
1381 case ir_binop_borrow:
1382 if (lowering(BORROW_TO_ARITH))
1383 borrow_to_arith(ir);
1384 break;
1385
1386 case ir_unop_saturate:
1387 if (lowering(SAT_TO_CLAMP))
1388 sat_to_clamp(ir);
1389 break;
1390
1391 case ir_unop_trunc:
1392 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1393 dtrunc_to_dfrac(ir);
1394 break;
1395
1396 case ir_unop_ceil:
1397 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1398 dceil_to_dfrac(ir);
1399 break;
1400
1401 case ir_unop_floor:
1402 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1403 dfloor_to_dfrac(ir);
1404 break;
1405
1406 case ir_unop_round_even:
1407 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1408 dround_even_to_dfrac(ir);
1409 break;
1410
1411 case ir_unop_sign:
1412 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1413 dsign_to_csel(ir);
1414 break;
1415
1416 case ir_unop_bit_count:
1417 if (lowering(BIT_COUNT_TO_MATH))
1418 bit_count_to_math(ir);
1419 break;
1420
1421 case ir_triop_bitfield_extract:
1422 if (lowering(EXTRACT_TO_SHIFTS))
1423 extract_to_shifts(ir);
1424 break;
1425
1426 case ir_quadop_bitfield_insert:
1427 if (lowering(INSERT_TO_SHIFTS))
1428 insert_to_shifts(ir);
1429 break;
1430
1431 case ir_unop_bitfield_reverse:
1432 if (lowering(REVERSE_TO_SHIFTS))
1433 reverse_to_shifts(ir);
1434 break;
1435
1436 case ir_unop_find_lsb:
1437 if (lowering(FIND_LSB_TO_FLOAT_CAST))
1438 find_lsb_to_float_cast(ir);
1439 break;
1440
1441 default:
1442 return visit_continue;
1443 }
1444
1445 return visit_continue;
1446 }