glsl: Add lowering pass for ir_unop_find_msb
mesa.git: src/compiler/glsl/lower_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree. This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
31 *
32 * Currently supported transformations:
33 * - SUB_TO_ADD_NEG
34 * - DIV_TO_MUL_RCP
35 * - INT_DIV_TO_MUL_RCP
36 * - EXP_TO_EXP2
37 * - POW_TO_EXP2
38 * - LOG_TO_LOG2
39 * - MOD_TO_FLOOR
40 * - LDEXP_TO_ARITH
41  * - DFREXP_DLDEXP_TO_ARITH
42 * - CARRY_TO_ARITH
43 * - BORROW_TO_ARITH
44 * - SAT_TO_CLAMP
45 * - DOPS_TO_DFRAC
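 * - BIT_COUNT_TO_MATH
 * - EXTRACT_TO_SHIFTS
 * - INSERT_TO_SHIFTS
 * - REVERSE_TO_SHIFTS
 * - FIND_LSB_TO_FLOAT_CAST
 * - FIND_MSB_TO_FLOAT_CAST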
46 *
47 * SUB_TO_ADD_NEG:
48 * ---------------
49 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
50 *
51 * This simplifies expression reassociation, and for many backends
52 * there is no subtract operation separate from adding the negation.
53  * Backends that do have a native subtract operation will probably
54  * want to recognize add(op0, neg(op1)) (or the operands swapped) and
55  * turn it back into a subtract anyway.
56 *
57 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
58 * --------------------------------------
59 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
60 *
61 * Many GPUs don't have a divide instruction (945 and 965 included),
62 * but they do have an RCP instruction to compute an approximate
63 * reciprocal. By breaking the operation down, constant reciprocals
64 * can get constant folded.
65 *
66 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
67 * handles the integer case, converting to and from floating point so that
68 * RCP is possible.
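 * For example, an int division a / b roughly becomes
 * f2i(i2f(a) * rcp(i2f(b))), with an extra i2u at the end for unsigned
 * operands (see int_div_to_mul_rcp below).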
69 *
70 * EXP_TO_EXP2 and LOG_TO_LOG2:
71 * ----------------------------
72 * Many GPUs don't have a base e log or exponent instruction, but they
73 * do have base 2 versions, so this pass converts exp and log to exp2
74 * and log2 operations.
75 *
76 * POW_TO_EXP2:
77 * -----------
78 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
79 * x**y to 2**(y * log2(x)).
80 *
81 * MOD_TO_FLOOR:
82 * -------------
83 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
84 *
85 * Many GPUs don't have a MOD instruction (945 and 965 included), and
86 * if we have to break it down like this anyway, it gives an
87 * opportunity to do things like constant fold the (1.0 / op1) easily.
88 *
89  * Note: this used to be implemented as op1 * fract(op0 / op1), but that
90  * implementation had significant precision errors.
91 *
92 * LDEXP_TO_ARITH:
93  * ---------------
94 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
95 *
96 * DFREXP_DLDEXP_TO_ARITH:
97  * -----------------------
98 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
99 * arithmetic and bit ops for double arguments.
100 *
101 * CARRY_TO_ARITH:
102 * ---------------
103  * Converts ir_binop_carry into (x + y) < x.
104 *
105 * BORROW_TO_ARITH:
106 * ----------------
107  * Converts ir_binop_borrow into (x < y).
108 *
109 * SAT_TO_CLAMP:
110 * -------------
111 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
112 *
113 * DOPS_TO_DFRAC:
114 * --------------
115 * Converts double trunc, ceil, floor, round to fract
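 *
 * BIT_COUNT_TO_MATH, EXTRACT_TO_SHIFTS, INSERT_TO_SHIFTS, REVERSE_TO_SHIFTS:
 * ---------------------------------------------------------------------------
 * Convert ir_unop_bit_count, ir_triop_bitfield_extract,
 * ir_quadop_bitfield_insert, and ir_unop_bitfield_reverse to sequences of
 * shifts, masks, and integer arithmetic.
 *
 * FIND_LSB_TO_FLOAT_CAST and FIND_MSB_TO_FLOAT_CAST:
 * --------------------------------------------------
 * Convert ir_unop_find_lsb and ir_unop_find_msb to an int-to-float cast and
 * extraction of the exponent bits.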
116 */
117
118 #include "c99_math.h"
119 #include "program/prog_instruction.h" /* for swizzle */
120 #include "compiler/glsl_types.h"
121 #include "ir.h"
122 #include "ir_builder.h"
123 #include "ir_optimization.h"
124
125 using namespace ir_builder;
126
127 namespace {
128
129 class lower_instructions_visitor : public ir_hierarchical_visitor {
130 public:
131 lower_instructions_visitor(unsigned lower)
132 : progress(false), lower(lower) { }
133
134 ir_visitor_status visit_leave(ir_expression *);
135
136 bool progress;
137
138 private:
139 unsigned lower; /** Bitfield of which operations to lower */
140
141 void sub_to_add_neg(ir_expression *);
142 void div_to_mul_rcp(ir_expression *);
143 void int_div_to_mul_rcp(ir_expression *);
144 void mod_to_floor(ir_expression *);
145 void exp_to_exp2(ir_expression *);
146 void pow_to_exp2(ir_expression *);
147 void log_to_log2(ir_expression *);
148 void ldexp_to_arith(ir_expression *);
149 void dldexp_to_arith(ir_expression *);
150 void dfrexp_sig_to_arith(ir_expression *);
151 void dfrexp_exp_to_arith(ir_expression *);
152 void carry_to_arith(ir_expression *);
153 void borrow_to_arith(ir_expression *);
154 void sat_to_clamp(ir_expression *);
155 void double_dot_to_fma(ir_expression *);
156 void double_lrp(ir_expression *);
157 void dceil_to_dfrac(ir_expression *);
158 void dfloor_to_dfrac(ir_expression *);
159 void dround_even_to_dfrac(ir_expression *);
160 void dtrunc_to_dfrac(ir_expression *);
161 void dsign_to_csel(ir_expression *);
162 void bit_count_to_math(ir_expression *);
163 void extract_to_shifts(ir_expression *);
164 void insert_to_shifts(ir_expression *);
165 void reverse_to_shifts(ir_expression *ir);
166 void find_lsb_to_float_cast(ir_expression *ir);
167 void find_msb_to_float_cast(ir_expression *ir);
168 };
169
170 } /* anonymous namespace */
171
172 /**
173 * Determine if a particular type of lowering should occur
174 */
175 #define lowering(x) (this->lower & x)
176
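/* Entry point.  <what_to_lower> is a bitfield of the *_TO_* flags above,
 * OR'd together by the driver (the flags are assumed to be defined in
 * ir_optimization.h, which is included above), e.g.:
 *
 *    lower_instructions(ir, SUB_TO_ADD_NEG | EXP_TO_EXP2 | LOG_TO_LOG2);
 *
 * Returns true if any expression was rewritten.
 */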
177 bool
178 lower_instructions(exec_list *instructions, unsigned what_to_lower)
179 {
180 lower_instructions_visitor v(what_to_lower);
181
182 visit_list_elements(&v, instructions);
183 return v.progress;
184 }
185
186 void
187 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
188 {
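   /* op0 - op1 -> op0 + neg(op1) */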
189 ir->operation = ir_binop_add;
190 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
191 ir->operands[1], NULL);
192 this->progress = true;
193 }
194
195 void
196 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
197 {
198 assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
199
200 /* New expression for the 1.0 / op1 */
201 ir_rvalue *expr;
202 expr = new(ir) ir_expression(ir_unop_rcp,
203 ir->operands[1]->type,
204 ir->operands[1]);
205
206 /* op0 / op1 -> op0 * (1.0 / op1) */
207 ir->operation = ir_binop_mul;
208 ir->operands[1] = expr;
209
210 this->progress = true;
211 }
212
213 void
214 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
215 {
216 assert(ir->operands[1]->type->is_integer());
217
218 /* Be careful with integer division -- we need to do it as a
219 * float and re-truncate, since rcp(n > 1) of an integer would
220 * just be 0.
221 */
222 ir_rvalue *op0, *op1;
223 const struct glsl_type *vec_type;
224
225 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
226 ir->operands[1]->type->vector_elements,
227 ir->operands[1]->type->matrix_columns);
228
229 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
230 op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
231 else
232 op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
233
234 op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
235
236 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
237 ir->operands[0]->type->vector_elements,
238 ir->operands[0]->type->matrix_columns);
239
240 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
241 op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
242 else
243 op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
244
245 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
246 ir->type->vector_elements,
247 ir->type->matrix_columns);
248
249 op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
250
251 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
252 ir->operation = ir_unop_f2i;
253 ir->operands[0] = op0;
254 } else {
255 ir->operation = ir_unop_i2u;
256 ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
257 }
258 ir->operands[1] = NULL;
259
260 this->progress = true;
261 }
262
263 void
264 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
265 {
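   /* exp(x) == exp2(x * log2(e)), so fold the conversion constant into the
    * operand.
    */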
266 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
267
268 ir->operation = ir_unop_exp2;
269 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
270 ir->operands[0], log2_e);
271 this->progress = true;
272 }
273
274 void
275 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
276 {
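   /* x**y == 2**(y * log2(x)); see POW_TO_EXP2 above. */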
277 ir_expression *const log2_x =
278 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
279 ir->operands[0]);
280
281 ir->operation = ir_unop_exp2;
282 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
283 ir->operands[1], log2_x);
284 ir->operands[1] = NULL;
285 this->progress = true;
286 }
287
288 void
289 lower_instructions_visitor::log_to_log2(ir_expression *ir)
290 {
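   /* log(x) == log2(x) / log2(e), i.e. log2(x) * (1.0 / log2(e)). */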
291 ir->operation = ir_binop_mul;
292 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
293 ir->operands[0], NULL);
294 ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
295 this->progress = true;
296 }
297
298 void
299 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
300 {
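   /* mod(x, y) -> x - y * floor(x / y).  Copy both operands into temporaries
    * first so that each one is evaluated exactly once.
    */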
301 ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
302 ir_var_temporary);
303 ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
304 ir_var_temporary);
305 this->base_ir->insert_before(x);
306 this->base_ir->insert_before(y);
307
308 ir_assignment *const assign_x =
309 new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
310 ir->operands[0], NULL);
311 ir_assignment *const assign_y =
312 new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
313 ir->operands[1], NULL);
314
315 this->base_ir->insert_before(assign_x);
316 this->base_ir->insert_before(assign_y);
317
318 ir_expression *const div_expr =
319 new(ir) ir_expression(ir_binop_div, x->type,
320 new(ir) ir_dereference_variable(x),
321 new(ir) ir_dereference_variable(y));
322
323 /* Don't generate new IR that would need to be lowered in an additional
324 * pass.
325 */
326 if (lowering(DIV_TO_MUL_RCP) && (ir->type->is_float() || ir->type->is_double()))
327 div_to_mul_rcp(div_expr);
328
329 ir_expression *const floor_expr =
330 new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
331
332 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
333 dfloor_to_dfrac(floor_expr);
334
335 ir_expression *const mul_expr =
336 new(ir) ir_expression(ir_binop_mul,
337 new(ir) ir_dereference_variable(y),
338 floor_expr);
339
340 ir->operation = ir_binop_sub;
341 ir->operands[0] = new(ir) ir_dereference_variable(x);
342 ir->operands[1] = mul_expr;
343 this->progress = true;
344 }
345
346 void
347 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
348 {
349 /* Translates
350 * ir_binop_ldexp x exp
351 * into
352 *
353 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
354 * resulting_biased_exp = extracted_biased_exp + exp;
355 *
356 * if (resulting_biased_exp < 1 || x == 0.0f) {
357 * return copysign(0.0, x);
358 * }
359 *
360 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
361 * lshift(i2u(resulting_biased_exp), exp_shift));
362 *
363 * which we can't actually implement as such, since the GLSL IR doesn't
364 * have vectorized if-statements. We actually implement it without branches
365 * using conditional-select:
366 *
367 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
368 * resulting_biased_exp = extracted_biased_exp + exp;
369 *
370 * is_not_zero_or_underflow = logic_and(nequal(x, 0.0f),
371  *                                         gequal(resulting_biased_exp, 1));
372 * x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
373 * resulting_biased_exp = csel(is_not_zero_or_underflow,
374 * resulting_biased_exp, 0);
375 *
376 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
377 * lshift(i2u(resulting_biased_exp), exp_shift));
378 */
379
380 const unsigned vec_elem = ir->type->vector_elements;
381
382 /* Types */
383 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
384 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
385
386 /* Constants */
387 ir_constant *zeroi = ir_constant::zero(ir, ivec);
388
389 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
390
391 ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
392 ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
393
394 /* Temporary variables */
395 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
396 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
397
398 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
399 ir_var_temporary);
400
401 ir_variable *extracted_biased_exp =
402 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
403 ir_variable *resulting_biased_exp =
404 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
405
406 ir_variable *is_not_zero_or_underflow =
407 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
408
409 ir_instruction &i = *base_ir;
410
411 /* Copy <x> and <exp> arguments. */
412 i.insert_before(x);
413 i.insert_before(assign(x, ir->operands[0]));
414 i.insert_before(exp);
415 i.insert_before(assign(exp, ir->operands[1]));
416
417 /* Extract the biased exponent from <x>. */
418 i.insert_before(extracted_biased_exp);
419 i.insert_before(assign(extracted_biased_exp,
420 rshift(bitcast_f2i(abs(x)), exp_shift)));
421
422 i.insert_before(resulting_biased_exp);
423 i.insert_before(assign(resulting_biased_exp,
424 add(extracted_biased_exp, exp)));
425
426 /* Test if result is ±0.0, subnormal, or underflow by checking if the
427 * resulting biased exponent would be less than 0x1. If so, the result is
428 * 0.0 with the sign of x. (Actually, invert the conditions so that
429 * immediate values are the second arguments, which is better for i965)
430 */
431 i.insert_before(zero_sign_x);
432 i.insert_before(assign(zero_sign_x,
433 bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
434
435 i.insert_before(is_not_zero_or_underflow);
436 i.insert_before(assign(is_not_zero_or_underflow,
437 logic_and(nequal(x, new(ir) ir_constant(0.0f, vec_elem)),
438 gequal(resulting_biased_exp,
439 new(ir) ir_constant(0x1, vec_elem)))));
440 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
441 x, zero_sign_x)));
442 i.insert_before(assign(resulting_biased_exp,
443 csel(is_not_zero_or_underflow,
444 resulting_biased_exp, zeroi)));
445
446 /* We could test for overflows by checking if the resulting biased exponent
447 * would be greater than 0xFE. Turns out we don't need to because the GLSL
448 * spec says:
449 *
450 * "If this product is too large to be represented in the
451 * floating-point type, the result is undefined."
452 */
453
454 ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
455 ir->operation = ir_unop_bitcast_i2f;
456 ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
457 exp_shift_clone, exp_width);
458 ir->operands[1] = NULL;
459
460 this->progress = true;
461 }
462
463 void
464 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
465 {
466 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
467 * from the significand.
468 */
469
470 const unsigned vec_elem = ir->type->vector_elements;
471
472 /* Types */
473 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
474 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
475
476 /* Constants */
477 ir_constant *zeroi = ir_constant::zero(ir, ivec);
478
479 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
480
481 ir_constant *exp_shift = new(ir) ir_constant(20u);
482 ir_constant *exp_width = new(ir) ir_constant(11u);
483 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
484
485 /* Temporary variables */
486 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
487 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
488
489 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
490 ir_var_temporary);
491
492 ir_variable *extracted_biased_exp =
493 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
494 ir_variable *resulting_biased_exp =
495 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
496
497 ir_variable *is_not_zero_or_underflow =
498 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
499
500 ir_instruction &i = *base_ir;
501
502 /* Copy <x> and <exp> arguments. */
503 i.insert_before(x);
504 i.insert_before(assign(x, ir->operands[0]));
505 i.insert_before(exp);
506 i.insert_before(assign(exp, ir->operands[1]));
507
508 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
509 if (lowering(DFREXP_DLDEXP_TO_ARITH))
510 dfrexp_exp_to_arith(frexp_exp);
511
512 /* Extract the biased exponent from <x>. */
513 i.insert_before(extracted_biased_exp);
514 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
515
516 i.insert_before(resulting_biased_exp);
517 i.insert_before(assign(resulting_biased_exp,
518 add(extracted_biased_exp, exp)));
519
520 /* Test if result is ±0.0, subnormal, or underflow by checking if the
521 * resulting biased exponent would be less than 0x1. If so, the result is
522 * 0.0 with the sign of x. (Actually, invert the conditions so that
523 * immediate values are the second arguments, which is better for i965)
524 * TODO: Implement in a vector fashion.
525 */
526 i.insert_before(zero_sign_x);
527 for (unsigned elem = 0; elem < vec_elem; elem++) {
528 ir_variable *unpacked =
529 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
530 i.insert_before(unpacked);
531 i.insert_before(
532 assign(unpacked,
533 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
534 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
535 WRITEMASK_Y));
536 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
537 i.insert_before(assign(zero_sign_x,
538 expr(ir_unop_pack_double_2x32, unpacked),
539 1 << elem));
540 }
541 i.insert_before(is_not_zero_or_underflow);
542 i.insert_before(assign(is_not_zero_or_underflow,
543 gequal(resulting_biased_exp,
544 new(ir) ir_constant(0x1, vec_elem))));
545 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
546 x, zero_sign_x)));
547 i.insert_before(assign(resulting_biased_exp,
548 csel(is_not_zero_or_underflow,
549 resulting_biased_exp, zeroi)));
550
551 /* We could test for overflows by checking if the resulting biased exponent
552 * would be greater than 0xFE. Turns out we don't need to because the GLSL
553 * spec says:
554 *
555 * "If this product is too large to be represented in the
556 * floating-point type, the result is undefined."
557 */
558
559 ir_rvalue *results[4] = {NULL};
560 for (unsigned elem = 0; elem < vec_elem; elem++) {
561 ir_variable *unpacked =
562 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
563 i.insert_before(unpacked);
564 i.insert_before(
565 assign(unpacked,
566 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
567
568 ir_expression *bfi = bitfield_insert(
569 swizzle_y(unpacked),
570 i2u(swizzle(resulting_biased_exp, elem, 1)),
571 exp_shift->clone(ir, NULL),
572 exp_width->clone(ir, NULL));
573
574 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
575
576 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
577 }
578
579 ir->operation = ir_quadop_vector;
580 ir->operands[0] = results[0];
581 ir->operands[1] = results[1];
582 ir->operands[2] = results[2];
583 ir->operands[3] = results[3];
584
585 /* Don't generate new IR that would need to be lowered in an additional
586 * pass.
587 */
588
589 this->progress = true;
590 }
591
592 void
593 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
594 {
595 const unsigned vec_elem = ir->type->vector_elements;
596 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
597
598 /* Double-precision floating-point values are stored as
599 * 1 sign bit;
600 * 11 exponent bits;
601 * 52 mantissa bits.
602 *
603 * We're just extracting the significand here, so we only need to modify
604 * the upper 32-bit uint. Unfortunately we must extract each double
605 * independently as there is no vector version of unpackDouble.
606 */
607
608 ir_instruction &i = *base_ir;
609
610 ir_variable *is_not_zero =
611 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
612 ir_rvalue *results[4] = {NULL};
613
614 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
615 i.insert_before(is_not_zero);
616 i.insert_before(
617 assign(is_not_zero,
618 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
619
620 /* TODO: Remake this as more vector-friendly when int64 support is
621 * available.
622 */
623 for (unsigned elem = 0; elem < vec_elem; elem++) {
624 ir_constant *zero = new(ir) ir_constant(0u, 1);
625 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
626
627 /* Exponent of double floating-point values in the range [0.5, 1.0). */
628 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
629
630 ir_variable *bits =
631 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
632 ir_variable *unpacked =
633 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
634
635 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
636
637 i.insert_before(bits);
638 i.insert_before(unpacked);
639 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
640
641 /* Manipulate the high uint to remove the exponent and replace it with
642 * either the default exponent or zero.
643 */
644 i.insert_before(assign(bits, swizzle_y(unpacked)));
645 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
646 i.insert_before(assign(bits, bit_or(bits,
647 csel(swizzle(is_not_zero, elem, 1),
648 exponent_value,
649 zero))));
650 i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
651 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
652 }
653
654 /* Put the dvec back together */
655 ir->operation = ir_quadop_vector;
656 ir->operands[0] = results[0];
657 ir->operands[1] = results[1];
658 ir->operands[2] = results[2];
659 ir->operands[3] = results[3];
660
661 this->progress = true;
662 }
663
664 void
665 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
666 {
667 const unsigned vec_elem = ir->type->vector_elements;
668 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
669 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
670
671 /* Double-precision floating-point values are stored as
672 * 1 sign bit;
673 * 11 exponent bits;
674 * 52 mantissa bits.
675 *
676 * We're just extracting the exponent here, so we only care about the upper
677 * 32-bit uint.
678 */
679
680 ir_instruction &i = *base_ir;
681
682 ir_variable *is_not_zero =
683 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
684 ir_variable *high_words =
685 new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
686 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
687 ir_constant *izero = new(ir) ir_constant(0, vec_elem);
688
689 ir_rvalue *absval = abs(ir->operands[0]);
690
691 i.insert_before(is_not_zero);
692 i.insert_before(high_words);
693 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
694
695 /* Extract all of the upper uints. */
696 for (unsigned elem = 0; elem < vec_elem; elem++) {
697 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
698
699 i.insert_before(assign(high_words,
700 swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
701 1 << elem));
702
703 }
704 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
705 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
706
707 /* For non-zero inputs, shift the exponent down and apply bias. */
708 ir->operation = ir_triop_csel;
709 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
710 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
711 ir->operands[2] = izero;
712
713 this->progress = true;
714 }
715
716 void
717 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
718 {
719 /* Translates
720 * ir_binop_carry x y
721 * into
722 * sum = ir_binop_add x y
723 * bcarry = ir_binop_less sum x
724 * carry = ir_unop_b2i bcarry
725 */
726
727 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
728 ir->operation = ir_unop_i2u;
729 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
730 ir->operands[1] = NULL;
731
732 this->progress = true;
733 }
734
735 void
736 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
737 {
738 /* Translates
739 * ir_binop_borrow x y
740 * into
741 * bcarry = ir_binop_less x y
742 * carry = ir_unop_b2i bcarry
743 */
744
745 ir->operation = ir_unop_i2u;
746 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
747 ir->operands[1] = NULL;
748
749 this->progress = true;
750 }
751
752 void
753 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
754 {
755 /* Translates
756 * ir_unop_saturate x
757 * into
758 * ir_binop_min (ir_binop_max(x, 0.0), 1.0)
759 */
760
761 ir->operation = ir_binop_min;
762 ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
763 ir->operands[0],
764 new(ir) ir_constant(0.0f));
765 ir->operands[1] = new(ir) ir_constant(1.0f);
766
767 this->progress = true;
768 }
769
770 void
771 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
772 {
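   /* dot(a, b) on doubles becomes a chain of fmas accumulated into dot_res:
    * the loop below handles components N-1..1, and the expression itself
    * becomes fma(a.x, b.x, dot_res).
    */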
773 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
774 ir_var_temporary);
775 this->base_ir->insert_before(temp);
776
777 int nc = ir->operands[0]->type->components();
778 for (int i = nc - 1; i >= 1; i--) {
779 ir_assignment *assig;
780 if (i == (nc - 1)) {
781 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
782 swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
783 } else {
784 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
785 swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
786 temp));
787 }
788 this->base_ir->insert_before(assig);
789 }
790
791 ir->operation = ir_triop_fma;
792 ir->operands[0] = swizzle(ir->operands[0], 0, 1);
793 ir->operands[1] = swizzle(ir->operands[1], 0, 1);
794 ir->operands[2] = new(ir) ir_dereference_variable(temp);
795
796 this->progress = true;
797
798 }
799
800 void
801 lower_instructions_visitor::double_lrp(ir_expression *ir)
802 {
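   /* lrp(x, y, a) -> fma(a, y, (1 - a) * x).  When <a> is a scalar (the
    * lrp(dvec, dvec, double) overload) it is broadcast with an XXXX swizzle
    * to match the width of <x>.
    */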
803 int swizval;
804 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
805 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
806
807 switch (op2->type->vector_elements) {
808 case 1:
809 swizval = SWIZZLE_XXXX;
810 break;
811 default:
812 assert(op0->type->vector_elements == op2->type->vector_elements);
813 swizval = SWIZZLE_XYZW;
814 break;
815 }
816
817 ir->operation = ir_triop_fma;
818 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
819 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
820
821 this->progress = true;
822 }
823
824 void
825 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
826 {
827 /*
828 * frtemp = frac(x);
829 * temp = sub(x, frtemp);
830 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
831 */
832 ir_instruction &i = *base_ir;
833 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
834 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
835 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
836 ir_var_temporary);
837
838 i.insert_before(frtemp);
839 i.insert_before(assign(frtemp, fract(ir->operands[0])));
840
841 ir->operation = ir_binop_add;
842 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
843 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
844
845 this->progress = true;
846 }
847
848 void
849 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
850 {
851 /*
852 * frtemp = frac(x);
853 * result = sub(x, frtemp);
854 */
855 ir->operation = ir_binop_sub;
856 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
857
858 this->progress = true;
859 }
860 void
861 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
862 {
863 /*
864 * insane but works
865 * temp = x + 0.5;
866 * frtemp = frac(temp);
867 * t2 = sub(temp, frtemp);
868 * if (frac(x) == 0.5)
869 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
870 * else
871 * result = t2;
872  *
873 */
874 ir_instruction &i = *base_ir;
875 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
876 ir_var_temporary);
877 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
878 ir_var_temporary);
879 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
880 ir_var_temporary);
881 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
882 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
883 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
884
885 i.insert_before(temp);
886 i.insert_before(assign(temp, add(ir->operands[0], p5)));
887
888 i.insert_before(frtemp);
889 i.insert_before(assign(frtemp, fract(temp)));
890
891 i.insert_before(t2);
892 i.insert_before(assign(t2, sub(temp, frtemp)));
893
894 ir->operation = ir_triop_csel;
895 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
896 p5->clone(ir, NULL));
897 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
898 zero),
899 t2,
900 sub(t2, one));
901 ir->operands[2] = new(ir) ir_dereference_variable(t2);
902
903 this->progress = true;
904 }
905
906 void
907 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
908 {
909 /*
910 * frtemp = frac(x);
911 * temp = sub(x, frtemp);
912  * result = x >= 0 ? temp : temp + ((frtemp == 0.0) ? 0 : 1);
913 */
914 ir_rvalue *arg = ir->operands[0];
915 ir_instruction &i = *base_ir;
916
917 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
918 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
919 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
920 ir_var_temporary);
921 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
922 ir_var_temporary);
923
924 i.insert_before(frtemp);
925 i.insert_before(assign(frtemp, fract(arg)));
926 i.insert_before(temp);
927 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
928
929 ir->operation = ir_triop_csel;
930 ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
931 ir->operands[1] = new (ir) ir_dereference_variable(temp);
932 ir->operands[2] = add(temp,
933 csel(equal(frtemp, zero->clone(ir, NULL)),
934 zero->clone(ir, NULL),
935 one));
936
937 this->progress = true;
938 }
939
940 void
941 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
942 {
943 /*
944 * temp = x > 0.0 ? 1.0 : 0.0;
945 * result = x < 0.0 ? -1.0 : temp;
946 */
947 ir_rvalue *arg = ir->operands[0];
948 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
949 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
950 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
951
952 ir->operation = ir_triop_csel;
953 ir->operands[0] = less(arg->clone(ir, NULL),
954 zero->clone(ir, NULL));
955 ir->operands[1] = neg_one;
956 ir->operands[2] = csel(greater(arg, zero),
957 one,
958 zero->clone(ir, NULL));
959
960 this->progress = true;
961 }
962
963 void
964 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
965 {
966 /* For more details, see:
967 *
968  * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
969 */
970 const unsigned elements = ir->operands[0]->type->vector_elements;
971 ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
972 ir_var_temporary);
973 ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
974 ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
975 ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
976 ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
977 ir_constant *c1 = new(ir) ir_constant(1u);
978 ir_constant *c2 = new(ir) ir_constant(2u);
979 ir_constant *c4 = new(ir) ir_constant(4u);
980 ir_constant *c24 = new(ir) ir_constant(24u);
981
982 base_ir->insert_before(temp);
983
984 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
985 base_ir->insert_before(assign(temp, ir->operands[0]));
986 } else {
987 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
988 base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
989 }
990
991 /* temp = temp - ((temp >> 1) & 0x55555555u); */
992 base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
993 c55555555))));
994
995 /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
996 base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
997 bit_and(rshift(temp, c2),
998 c33333333->clone(ir, NULL)))));
999
1000    /* int((((temp + (temp >> 4)) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
1001 ir->operation = ir_unop_u2i;
1002 ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
1003 c01010101),
1004 c24);
1005
1006 this->progress = true;
1007 }
1008
1009 void
1010 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
1011 {
1012 ir_variable *bits =
1013 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1014
1015 base_ir->insert_before(bits);
1016 base_ir->insert_before(assign(bits, ir->operands[2]));
1017
1018 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1019 ir_constant *c1 =
1020 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1021 ir_constant *c32 =
1022 new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1023 ir_constant *cFFFFFFFF =
1024 new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1025
1026 /* At least some hardware treats (x << y) as (x << (y%32)). This means
1027 * we'd get a mask of 0 when bits is 32. Special case it.
1028 *
1029 * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
1030 */
1031 ir_expression *mask = csel(equal(bits, c32),
1032 cFFFFFFFF,
1033 sub(lshift(c1, bits), c1->clone(ir, NULL)));
1034
1035 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1036 *
1037 * If bits is zero, the result will be zero.
1038 *
1039 * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
1040 * select as in the signed integer case.
1041 *
1042 * (value >> offset) & mask;
1043 */
1044 ir->operation = ir_binop_bit_and;
1045 ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
1046 ir->operands[1] = mask;
1047 ir->operands[2] = NULL;
1048 } else {
1049 ir_constant *c0 =
1050 new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
1051 ir_constant *c32 =
1052 new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1053 ir_variable *temp =
1054 new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
1055
1056 /* temp = 32 - bits; */
1057 base_ir->insert_before(temp);
1058 base_ir->insert_before(assign(temp, sub(c32, bits)));
1059
1060 /* expr = value << (temp - offset)) >> temp; */
1061 ir_expression *expr =
1062 rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
1063
1064 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1065 *
1066 * If bits is zero, the result will be zero.
1067 *
1068 * Due to the (x << (y%32)) behavior mentioned before, the (value <<
1069 * (32-0)) doesn't "erase" all of the data as we would like, so finish
1070 * up with:
1071 *
1072 * (bits == 0) ? 0 : e;
1073 */
1074 ir->operation = ir_triop_csel;
1075 ir->operands[0] = equal(c0, bits);
1076 ir->operands[1] = c0->clone(ir, NULL);
1077 ir->operands[2] = expr;
1078 }
1079
1080 this->progress = true;
1081 }
1082
1083 void
1084 lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
1085 {
1086 ir_constant *c1;
1087 ir_constant *c32;
1088 ir_constant *cFFFFFFFF;
1089 ir_variable *offset =
1090 new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
1091 ir_variable *bits =
1092 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1093 ir_variable *mask =
1094 new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
1095
1096 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1097 c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
1098 c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1099 cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
1100 } else {
1101 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1102
1103 c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1104 c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1105 cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1106 }
1107
1108 base_ir->insert_before(offset);
1109 base_ir->insert_before(assign(offset, ir->operands[2]));
1110
1111 base_ir->insert_before(bits);
1112 base_ir->insert_before(assign(bits, ir->operands[3]));
1113
1114 /* At least some hardware treats (x << y) as (x << (y%32)). This means
1115 * we'd get a mask of 0 when bits is 32. Special case it.
1116 *
1117 * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
1118 *
1119 * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1120 *
1121 * The result will be undefined if offset or bits is negative, or if the
1122 * sum of offset and bits is greater than the number of bits used to
1123 * store the operand.
1124 *
1125 * Since it's undefined, there are a couple other ways this could be
1126 * implemented. The other way that was considered was to put the csel
1127 * around the whole thing:
1128 *
1129 * final_result = bits == 32 ? insert : ... ;
1130 */
1131 base_ir->insert_before(mask);
1132
1133 base_ir->insert_before(assign(mask, csel(equal(bits, c32),
1134 cFFFFFFFF,
1135 lshift(sub(lshift(c1, bits),
1136 c1->clone(ir, NULL)),
1137 offset))));
1138
1139 /* (base & ~mask) | ((insert << offset) & mask) */
1140 ir->operation = ir_binop_bit_or;
1141 ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
1142 ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
1143 ir->operands[2] = NULL;
1144 ir->operands[3] = NULL;
1145
1146 this->progress = true;
1147 }
1148
1149 void
1150 lower_instructions_visitor::reverse_to_shifts(ir_expression *ir)
1151 {
1152 /* For more details, see:
1153 *
1154 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
1155 */
1156 ir_constant *c1 =
1157 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1158 ir_constant *c2 =
1159 new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements);
1160 ir_constant *c4 =
1161 new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements);
1162 ir_constant *c8 =
1163 new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements);
1164 ir_constant *c16 =
1165 new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements);
1166 ir_constant *c33333333 =
1167 new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements);
1168 ir_constant *c55555555 =
1169 new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements);
1170 ir_constant *c0F0F0F0F =
1171 new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements);
1172 ir_constant *c00FF00FF =
1173 new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements);
1174 ir_variable *temp =
1175 new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements),
1176 "temp", ir_var_temporary);
1177 ir_instruction &i = *base_ir;
1178
1179 i.insert_before(temp);
1180
1181 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1182 i.insert_before(assign(temp, ir->operands[0]));
1183 } else {
1184 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1185 i.insert_before(assign(temp, i2u(ir->operands[0])));
1186 }
1187
1188 /* Swap odd and even bits.
1189 *
1190 * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1);
1191 */
1192 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555),
1193 lshift(bit_and(temp, c55555555->clone(ir, NULL)),
1194 c1->clone(ir, NULL)))));
1195 /* Swap consecutive pairs.
1196 *
1197 * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2);
1198 */
1199 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333),
1200 lshift(bit_and(temp, c33333333->clone(ir, NULL)),
1201 c2->clone(ir, NULL)))));
1202
1203 /* Swap nibbles.
1204 *
1205 * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4);
1206 */
1207 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F),
1208 lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)),
1209 c4->clone(ir, NULL)))));
1210
1211 /* The last step is, basically, bswap. Swap the bytes, then swap the
1212 * words. When this code is run through GCC on x86, it does generate a
1213 * bswap instruction.
1214 *
1215 * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8);
1216 * temp = ( temp >> 16 ) | ( temp << 16);
1217 */
1218 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF),
1219 lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)),
1220 c8->clone(ir, NULL)))));
1221
1222 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1223 ir->operation = ir_binop_bit_or;
1224 ir->operands[0] = rshift(temp, c16);
1225 ir->operands[1] = lshift(temp, c16->clone(ir, NULL));
1226 } else {
1227 ir->operation = ir_unop_u2i;
1228 ir->operands[0] = bit_or(rshift(temp, c16),
1229 lshift(temp, c16->clone(ir, NULL)));
1230 }
1231
1232 this->progress = true;
1233 }
1234
1235 void
1236 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
1237 {
1238 /* For more details, see:
1239 *
1240 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1241 */
1242 const unsigned elements = ir->operands[0]->type->vector_elements;
1243 ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
1244 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1245 ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1246 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1247 ir_variable *temp =
1248 new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary);
1249 ir_variable *lsb_only =
1250 new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary);
1251 ir_variable *as_float =
1252 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1253 ir_variable *lsb =
1254 new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary);
1255
1256 ir_instruction &i = *base_ir;
1257
1258 i.insert_before(temp);
1259
1260 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1261 i.insert_before(assign(temp, ir->operands[0]));
1262 } else {
1263 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1264 i.insert_before(assign(temp, u2i(ir->operands[0])));
1265 }
1266
1267 /* The int-to-float conversion is lossless because (value & -value) is
1268 * either a power of two or zero. We don't use the result in the zero
1269 * case. The uint() cast is necessary so that 0x80000000 does not
1270 * generate a negative value.
1271 *
1272 * uint lsb_only = uint(value & -value);
1273 * float as_float = float(lsb_only);
1274 */
1275 i.insert_before(lsb_only);
1276 i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
1277
1278 i.insert_before(as_float);
1279 i.insert_before(assign(as_float, u2f(lsb_only)));
1280
1281 /* This is basically an open-coded frexp. Implementations that have a
1282 * native frexp instruction would be better served by that. This is
1283 * optimized versus a full-featured open-coded implementation in two ways:
1284 *
1285 * - We don't care about a correct result from subnormal numbers (including
1286 * 0.0), so the raw exponent can always be safely unbiased.
1287 *
1288 * - The value cannot be negative, so it does not need to be masked off to
1289 * extract the exponent.
1290 *
1291 * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1292 */
1293 i.insert_before(lsb);
1294 i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1295
1296 /* Use lsb_only in the comparison instead of temp so that the & (far above)
1297 * can possibly generate the result without an explicit comparison.
1298 *
1299 * (lsb_only == 0) ? -1 : lsb;
1300 *
1301 * Since our input values are all integers, the unbiased exponent must not
1302 * be negative. It will only be negative (-0x7f, in fact) if lsb_only is
1303 * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is
1304 * better is likely GPU dependent. Either way, the difference should be
1305 * small.
1306 */
1307 ir->operation = ir_triop_csel;
1308 ir->operands[0] = equal(lsb_only, c0);
1309 ir->operands[1] = cminus1;
1310 ir->operands[2] = new(ir) ir_dereference_variable(lsb);
1311
1312 this->progress = true;
1313 }
1314
1315 void
1316 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
1317 {
1318 /* For more details, see:
1319 *
1320 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1321 */
1322 const unsigned elements = ir->operands[0]->type->vector_elements;
1323 ir_constant *c0 = new(ir) ir_constant(int(0), elements);
1324 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1325 ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1326 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1327 ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
1328 ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
1329 ir_variable *temp =
1330 new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary);
1331 ir_variable *as_float =
1332 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1333 ir_variable *msb =
1334 new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary);
1335
1336 ir_instruction &i = *base_ir;
1337
1338 i.insert_before(temp);
1339
1340 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1341 i.insert_before(assign(temp, ir->operands[0]));
1342 } else {
1343 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1344
1345 /* findMSB(uint(abs(some_int))) almost always does the right thing.
1346 * There are two problem values:
1347 *
1348 * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns
1349 * 31. However, findMSB(int(0x80000000)) == 30.
1350 *
1351 * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns
1352 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1353 *
1354 * For a value of zero or negative one, -1 will be returned.
1355 *
1356 * For all negative number cases, including 0x80000000 and 0xffffffff,
1357 * the correct value is obtained from findMSB if instead of negating the
1358  * (already negative) value the logical-not is used. A conditional
1359 * logical-not can be achieved in two instructions.
1360 */
1361 ir_variable *as_int =
1362 new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary);
1363 ir_constant *c31 = new(ir) ir_constant(int(31), elements);
1364
1365 i.insert_before(as_int);
1366 i.insert_before(assign(as_int, ir->operands[0]));
1367 i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
1368 as_int,
1369 rshift(as_int, c31)))));
1370 }
1371
1372 /* The int-to-float conversion is lossless because bits are conditionally
1373 * masked off the bottom of temp to ensure the value has at most 24 bits of
1374 * data or is zero. We don't use the result in the zero case. The uint()
1375 * cast is necessary so that 0x80000000 does not generate a negative value.
1376 *
1377 * float as_float = float(temp > 255 ? temp & ~255 : temp);
1378 */
1379 i.insert_before(as_float);
1380 i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
1381 bit_and(temp, cFFFFFF00),
1382 temp))));
1383
1384 /* This is basically an open-coded frexp. Implementations that have a
1385 * native frexp instruction would be better served by that. This is
1386 * optimized versus a full-featured open-coded implementation in two ways:
1387 *
1388 * - We don't care about a correct result from subnormal numbers (including
1389 * 0.0), so the raw exponent can always be safely unbiased.
1390 *
1391 * - The value cannot be negative, so it does not need to be masked off to
1392 * extract the exponent.
1393 *
1394 * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1395 */
1396 i.insert_before(msb);
1397 i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1398
1399 /* Use msb in the comparison instead of temp so that the subtract can
1400 * possibly generate the result without an explicit comparison.
1401 *
1402 * (msb < 0) ? -1 : msb;
1403 *
1404 * Since our input values are all integers, the unbiased exponent must not
1405 * be negative. It will only be negative (-0x7f, in fact) if temp is 0.
1406 */
1407 ir->operation = ir_triop_csel;
1408 ir->operands[0] = less(msb, c0);
1409 ir->operands[1] = cminus1;
1410 ir->operands[2] = new(ir) ir_dereference_variable(msb);
1411
1412 this->progress = true;
1413 }
1414
1415 ir_visitor_status
1416 lower_instructions_visitor::visit_leave(ir_expression *ir)
1417 {
1418 switch (ir->operation) {
1419 case ir_binop_dot:
1420 if (ir->operands[0]->type->is_double())
1421 double_dot_to_fma(ir);
1422 break;
1423 case ir_triop_lrp:
1424 if (ir->operands[0]->type->is_double())
1425 double_lrp(ir);
1426 break;
1427 case ir_binop_sub:
1428 if (lowering(SUB_TO_ADD_NEG))
1429 sub_to_add_neg(ir);
1430 break;
1431
1432 case ir_binop_div:
1433 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
1434 int_div_to_mul_rcp(ir);
1435 else if ((ir->operands[1]->type->is_float() ||
1436 ir->operands[1]->type->is_double()) && lowering(DIV_TO_MUL_RCP))
1437 div_to_mul_rcp(ir);
1438 break;
1439
1440 case ir_unop_exp:
1441 if (lowering(EXP_TO_EXP2))
1442 exp_to_exp2(ir);
1443 break;
1444
1445 case ir_unop_log:
1446 if (lowering(LOG_TO_LOG2))
1447 log_to_log2(ir);
1448 break;
1449
1450 case ir_binop_mod:
1451 if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
1452 mod_to_floor(ir);
1453 break;
1454
1455 case ir_binop_pow:
1456 if (lowering(POW_TO_EXP2))
1457 pow_to_exp2(ir);
1458 break;
1459
1460 case ir_binop_ldexp:
1461 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1462 ldexp_to_arith(ir);
1463 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1464 dldexp_to_arith(ir);
1465 break;
1466
1467 case ir_unop_frexp_exp:
1468 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1469 dfrexp_exp_to_arith(ir);
1470 break;
1471
1472 case ir_unop_frexp_sig:
1473 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1474 dfrexp_sig_to_arith(ir);
1475 break;
1476
1477 case ir_binop_carry:
1478 if (lowering(CARRY_TO_ARITH))
1479 carry_to_arith(ir);
1480 break;
1481
1482 case ir_binop_borrow:
1483 if (lowering(BORROW_TO_ARITH))
1484 borrow_to_arith(ir);
1485 break;
1486
1487 case ir_unop_saturate:
1488 if (lowering(SAT_TO_CLAMP))
1489 sat_to_clamp(ir);
1490 break;
1491
1492 case ir_unop_trunc:
1493 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1494 dtrunc_to_dfrac(ir);
1495 break;
1496
1497 case ir_unop_ceil:
1498 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1499 dceil_to_dfrac(ir);
1500 break;
1501
1502 case ir_unop_floor:
1503 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1504 dfloor_to_dfrac(ir);
1505 break;
1506
1507 case ir_unop_round_even:
1508 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1509 dround_even_to_dfrac(ir);
1510 break;
1511
1512 case ir_unop_sign:
1513 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1514 dsign_to_csel(ir);
1515 break;
1516
1517 case ir_unop_bit_count:
1518 if (lowering(BIT_COUNT_TO_MATH))
1519 bit_count_to_math(ir);
1520 break;
1521
1522 case ir_triop_bitfield_extract:
1523 if (lowering(EXTRACT_TO_SHIFTS))
1524 extract_to_shifts(ir);
1525 break;
1526
1527 case ir_quadop_bitfield_insert:
1528 if (lowering(INSERT_TO_SHIFTS))
1529 insert_to_shifts(ir);
1530 break;
1531
1532 case ir_unop_bitfield_reverse:
1533 if (lowering(REVERSE_TO_SHIFTS))
1534 reverse_to_shifts(ir);
1535 break;
1536
1537 case ir_unop_find_lsb:
1538 if (lowering(FIND_LSB_TO_FLOAT_CAST))
1539 find_lsb_to_float_cast(ir);
1540 break;
1541
1542 case ir_unop_find_msb:
1543 if (lowering(FIND_MSB_TO_FLOAT_CAST))
1544 find_msb_to_float_cast(ir);
1545 break;
1546
1547 default:
1548 return visit_continue;
1549 }
1550
1551 return visit_continue;
1552 }