util/hash_set: Rework the API to know about hashing
[mesa.git] / src / glsl / lower_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree. This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
31 *
32 * Currently supported transformations:
33 * - SUB_TO_ADD_NEG
34 * - DIV_TO_MUL_RCP
35 * - INT_DIV_TO_MUL_RCP
36 * - EXP_TO_EXP2
37 * - POW_TO_EXP2
38 * - LOG_TO_LOG2
39 * - MOD_TO_FRACT
40 * - LDEXP_TO_ARITH
41 * - BITFIELD_INSERT_TO_BFM_BFI
42 * - CARRY_TO_ARITH
43 * - BORROW_TO_ARITH
44 * - SAT_TO_CLAMP
45 *
46 * SUB_TO_ADD_NEG:
47 * ---------------
48 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
49 *
50 * This simplifies expression reassociation, and for many backends
51 * there is no subtract operation separate from adding the negation.
52 * For backends with native subtract operations, they will probably
53 * want to recognize add(op0, neg(op1)) or the other way around to
54 * produce a subtract anyway.
55 *
56 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
57 * --------------------------------------
58 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
59 *
60 * Many GPUs don't have a divide instruction (945 and 965 included),
61 * but they do have an RCP instruction to compute an approximate
62 * reciprocal. By breaking the operation down, constant reciprocals
63 * can get constant folded.
64 *
65 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
66 * handles the integer case, converting to and from floating point so that
67 * RCP is possible.
68 *
69 * EXP_TO_EXP2 and LOG_TO_LOG2:
70 * ----------------------------
71 * Many GPUs don't have a base e log or exponent instruction, but they
72 * do have base 2 versions, so this pass converts exp and log to exp2
73 * and log2 operations.
74 *
75 * POW_TO_EXP2:
76 * -----------
77 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
78 * x**y to 2**(y * log2(x)).
79 *
80 * MOD_TO_FRACT:
81 * -------------
82 * Breaks an ir_binop_mod expression down to (op1 * fract(op0 / op1))
83 *
84 * Many GPUs don't have a MOD instruction (945 and 965 included), and
85 * if we have to break it down like this anyway, it gives an
86 * opportunity to do things like constant fold the (1.0 / op1) easily.
87 *
88 * LDEXP_TO_ARITH:
89 * -------------
90 * Converts ir_binop_ldexp to arithmetic and bit operations.
91 *
92 * BITFIELD_INSERT_TO_BFM_BFI:
93 * ---------------------------
94 * Breaks ir_quadop_bitfield_insert into ir_binop_bfm (bitfield mask) and
95 * ir_triop_bfi (bitfield insert).
96 *
97  * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader5
98 * with a pair of instructions.
99 *
100 * CARRY_TO_ARITH:
101 * ---------------
102  * Converts ir_binop_carry into (x + y) < x.
103 *
104 * BORROW_TO_ARITH:
105 * ----------------
106  * Converts ir_binop_borrow into (x < y).
107 *
108 * SAT_TO_CLAMP:
109 * -------------
110 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
111 *
112 */
113
114 #include "main/core.h" /* for M_LOG2E */
115 #include "glsl_types.h"
116 #include "ir.h"
117 #include "ir_builder.h"
118 #include "ir_optimization.h"
119
120 using namespace ir_builder;
121
namespace {

/**
 * Visitor that rewrites expression trees in place according to the
 * bitfield of lowering flags passed to the constructor.
 *
 * Each private helper implements exactly one of the transformations
 * described in the file comment; visit_leave() dispatches on the
 * expression opcode.
 */
class lower_instructions_visitor : public ir_hierarchical_visitor {
public:
   lower_instructions_visitor(unsigned lower)
      : progress(false), lower(lower) { }

   ir_visitor_status visit_leave(ir_expression *);

   /** Set to true as soon as any expression is lowered. */
   bool progress;

private:
   unsigned lower; /**< Bitfield of which operations to lower */

   /* One helper per lowering; each mutates its ir_expression in place. */
   void sub_to_add_neg(ir_expression *);
   void div_to_mul_rcp(ir_expression *);
   void int_div_to_mul_rcp(ir_expression *);
   void mod_to_fract(ir_expression *);
   void exp_to_exp2(ir_expression *);
   void pow_to_exp2(ir_expression *);
   void log_to_log2(ir_expression *);
   void bitfield_insert_to_bfm_bfi(ir_expression *);
   void ldexp_to_arith(ir_expression *);
   void carry_to_arith(ir_expression *);
   void borrow_to_arith(ir_expression *);
   void sat_to_clamp(ir_expression *);
};

} /* anonymous namespace */
151
/**
 * Determine if a particular type of lowering should occur
 *
 * Reads the visitor's \c lower bitfield, so this macro is only usable
 * inside lower_instructions_visitor member functions.  The argument is
 * parenthesized so that compound expressions expand safely.
 */
#define lowering(x) (this->lower & (x))
156
157 bool
158 lower_instructions(exec_list *instructions, unsigned what_to_lower)
159 {
160 lower_instructions_visitor v(what_to_lower);
161
162 visit_list_elements(&v, instructions);
163 return v.progress;
164 }
165
166 void
167 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
168 {
169 ir->operation = ir_binop_add;
170 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
171 ir->operands[1], NULL);
172 this->progress = true;
173 }
174
175 void
176 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
177 {
178 assert(ir->operands[1]->type->is_float());
179
180 /* New expression for the 1.0 / op1 */
181 ir_rvalue *expr;
182 expr = new(ir) ir_expression(ir_unop_rcp,
183 ir->operands[1]->type,
184 ir->operands[1]);
185
186 /* op0 / op1 -> op0 * (1.0 / op1) */
187 ir->operation = ir_binop_mul;
188 ir->operands[1] = expr;
189
190 this->progress = true;
191 }
192
void
lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
{
   assert(ir->operands[1]->type->is_integer());

   /* Be careful with integer division -- we need to do it as a
    * float and re-truncate, since rcp(n > 1) of an integer would
    * just be 0.
    */
   ir_rvalue *op0, *op1;
   const struct glsl_type *vec_type;

   /* Float vector type matching the shape of the denominator. */
   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->operands[1]->type->vector_elements,
                                      ir->operands[1]->type->matrix_columns);

   /* op1 -> float(op1), using the signed or unsigned conversion as
    * appropriate for the denominator's base type.
    */
   if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
      op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
   else
      op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);

   op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);

   /* Float vector type matching the shape of the numerator. */
   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->operands[0]->type->vector_elements,
                                      ir->operands[0]->type->matrix_columns);

   if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
      op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
   else
      op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);

   /* Float vector type matching the shape of the result. */
   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->type->vector_elements,
                                      ir->type->matrix_columns);

   /* float(op0) * rcp(float(op1)) */
   op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);

   /* Truncate back to the original integer type.  Note: operands[1] has
    * not been overwritten yet, so its base type is still valid here.
    */
   if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
      ir->operation = ir_unop_f2i;
      ir->operands[0] = op0;
   } else {
      /* NOTE(review): unsigned results go float -> int -> uint, so values
       * outside the signed-int range may not round-trip — confirm this is
       * acceptable for the supported input ranges.
       */
      ir->operation = ir_unop_i2u;
      ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
   }
   ir->operands[1] = NULL;

   this->progress = true;
}
242
243 void
244 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
245 {
246 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
247
248 ir->operation = ir_unop_exp2;
249 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
250 ir->operands[0], log2_e);
251 this->progress = true;
252 }
253
254 void
255 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
256 {
257 ir_expression *const log2_x =
258 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
259 ir->operands[0]);
260
261 ir->operation = ir_unop_exp2;
262 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
263 ir->operands[1], log2_x);
264 ir->operands[1] = NULL;
265 this->progress = true;
266 }
267
268 void
269 lower_instructions_visitor::log_to_log2(ir_expression *ir)
270 {
271 ir->operation = ir_binop_mul;
272 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
273 ir->operands[0], NULL);
274 ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
275 this->progress = true;
276 }
277
void
lower_instructions_visitor::mod_to_fract(ir_expression *ir)
{
   /* Rewrites mod(op0, op1) as op1 * fract(op0 / op1).  The divisor is
    * copied into a temporary because it is referenced twice (in the
    * divide and in the final multiply).
    */
   ir_variable *temp = new(ir) ir_variable(ir->operands[1]->type, "mod_b",
                                           ir_var_temporary);
   this->base_ir->insert_before(temp);

   /* mod_b = op1, emitted before the current instruction. */
   ir_assignment *const assign =
      new(ir) ir_assignment(new(ir) ir_dereference_variable(temp),
                            ir->operands[1], NULL);

   this->base_ir->insert_before(assign);

   ir_expression *const div_expr =
      new(ir) ir_expression(ir_binop_div, ir->operands[0]->type,
                            ir->operands[0],
                            new(ir) ir_dereference_variable(temp));

   /* Don't generate new IR that would need to be lowered in an additional
    * pass.
    */
   if (lowering(DIV_TO_MUL_RCP))
      div_to_mul_rcp(div_expr);

   ir_rvalue *expr = new(ir) ir_expression(ir_unop_fract,
                                           ir->operands[0]->type,
                                           div_expr,
                                           NULL);

   /* The node itself becomes mod_b * fract(op0 / mod_b). */
   ir->operation = ir_binop_mul;
   ir->operands[0] = new(ir) ir_dereference_variable(temp);
   ir->operands[1] = expr;
   this->progress = true;
}
312
313 void
314 lower_instructions_visitor::bitfield_insert_to_bfm_bfi(ir_expression *ir)
315 {
316 /* Translates
317 * ir_quadop_bitfield_insert base insert offset bits
318 * into
319 * ir_triop_bfi (ir_binop_bfm bits offset) insert base
320 */
321
322 ir_rvalue *base_expr = ir->operands[0];
323
324 ir->operation = ir_triop_bfi;
325 ir->operands[0] = new(ir) ir_expression(ir_binop_bfm,
326 ir->type->get_base_type(),
327 ir->operands[3],
328 ir->operands[2]);
329 /* ir->operands[1] is still the value to insert. */
330 ir->operands[2] = base_expr;
331 ir->operands[3] = NULL;
332
333 this->progress = true;
334 }
335
void
lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
{
   /* Translates
    *    ir_binop_ldexp x exp
    * into
    *
    *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    *    resulting_biased_exp = extracted_biased_exp + exp;
    *
    *    if (resulting_biased_exp < 1) {
    *       return copysign(0.0, x);
    *    }
    *
    *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
    *                       lshift(i2u(resulting_biased_exp), exp_shift));
    *
    * which we can't actually implement as such, since the GLSL IR doesn't
    * have vectorized if-statements. We actually implement it without branches
    * using conditional-select:
    *
    *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    *    resulting_biased_exp = extracted_biased_exp + exp;
    *
    *    is_not_zero_or_underflow = gequal(resulting_biased_exp, 1);
    *    x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
    *    resulting_biased_exp = csel(is_not_zero_or_underflow,
    *                                resulting_biased_exp, 0);
    *
    *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
    *                       lshift(i2u(resulting_biased_exp), exp_shift));
    */

   const unsigned vec_elem = ir->type->vector_elements;

   /* Types */
   const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);

   /* Constants */
   ir_constant *zeroi = ir_constant::zero(ir, ivec);

   /* Single-precision float layout: bit 31 is the sign, bits 30:23 are the
    * 8-bit biased exponent, hence the mask / shift / width values below.
    */
   ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);

   ir_constant *exp_shift = new(ir) ir_constant(23);
   ir_constant *exp_width = new(ir) ir_constant(8);

   /* Temporary variables */
   ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
   ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);

   ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
                                                  ir_var_temporary);

   ir_variable *extracted_biased_exp =
      new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
   ir_variable *resulting_biased_exp =
      new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);

   ir_variable *is_not_zero_or_underflow =
      new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);

   /* All the helper computations are emitted before the instruction that
    * contains this ldexp expression.
    */
   ir_instruction &i = *base_ir;

   /* Copy <x> and <exp> arguments. */
   i.insert_before(x);
   i.insert_before(assign(x, ir->operands[0]));
   i.insert_before(exp);
   i.insert_before(assign(exp, ir->operands[1]));

   /* Extract the biased exponent from <x>. */
   i.insert_before(extracted_biased_exp);
   i.insert_before(assign(extracted_biased_exp,
                          rshift(bitcast_f2i(abs(x)), exp_shift)));

   i.insert_before(resulting_biased_exp);
   i.insert_before(assign(resulting_biased_exp,
                          add(extracted_biased_exp, exp)));

   /* Test if result is ±0.0, subnormal, or underflow by checking if the
    * resulting biased exponent would be less than 0x1. If so, the result is
    * 0.0 with the sign of x. (Actually, invert the conditions so that
    * immediate values are the second arguments, which is better for i965)
    */
   i.insert_before(zero_sign_x);
   i.insert_before(assign(zero_sign_x,
                          bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));

   i.insert_before(is_not_zero_or_underflow);
   i.insert_before(assign(is_not_zero_or_underflow,
                          gequal(resulting_biased_exp,
                                 new(ir) ir_constant(0x1, vec_elem))));
   i.insert_before(assign(x, csel(is_not_zero_or_underflow,
                                  x, zero_sign_x)));
   i.insert_before(assign(resulting_biased_exp,
                          csel(is_not_zero_or_underflow,
                               resulting_biased_exp, zeroi)));

   /* We could test for overflows by checking if the resulting biased exponent
    * would be greater than 0xFE. Turns out we don't need to because the GLSL
    * spec says:
    *
    *    "If this product is too large to be represented in the
    *     floating-point type, the result is undefined."
    */

   /* exp_shift was already consumed by the rshift above, so splice in a
    * fresh clone for the final bitfield insert.
    */
   ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
   ir->operation = ir_unop_bitcast_i2f;
   ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
                                     exp_shift_clone, exp_width);
   ir->operands[1] = NULL;

   /* Don't generate new IR that would need to be lowered in an additional
    * pass.
    */
   if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
      bitfield_insert_to_bfm_bfi(ir->operands[0]->as_expression());

   this->progress = true;
}
456
457 void
458 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
459 {
460 /* Translates
461 * ir_binop_carry x y
462 * into
463 * sum = ir_binop_add x y
464 * bcarry = ir_binop_less sum x
465 * carry = ir_unop_b2i bcarry
466 */
467
468 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
469 ir->operation = ir_unop_i2u;
470 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
471 ir->operands[1] = NULL;
472
473 this->progress = true;
474 }
475
476 void
477 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
478 {
479 /* Translates
480 * ir_binop_borrow x y
481 * into
482 * bcarry = ir_binop_less x y
483 * carry = ir_unop_b2i bcarry
484 */
485
486 ir->operation = ir_unop_i2u;
487 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
488 ir->operands[1] = NULL;
489
490 this->progress = true;
491 }
492
493 void
494 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
495 {
496 /* Translates
497 * ir_unop_saturate x
498 * into
499 * ir_binop_min (ir_binop_max(x, 0.0), 1.0)
500 */
501
502 ir->operation = ir_binop_min;
503 ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
504 ir->operands[0],
505 new(ir) ir_constant(0.0f));
506 ir->operands[1] = new(ir) ir_constant(1.0f);
507
508 this->progress = true;
509 }
510
511 ir_visitor_status
512 lower_instructions_visitor::visit_leave(ir_expression *ir)
513 {
514 switch (ir->operation) {
515 case ir_binop_sub:
516 if (lowering(SUB_TO_ADD_NEG))
517 sub_to_add_neg(ir);
518 break;
519
520 case ir_binop_div:
521 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
522 int_div_to_mul_rcp(ir);
523 else if (ir->operands[1]->type->is_float() && lowering(DIV_TO_MUL_RCP))
524 div_to_mul_rcp(ir);
525 break;
526
527 case ir_unop_exp:
528 if (lowering(EXP_TO_EXP2))
529 exp_to_exp2(ir);
530 break;
531
532 case ir_unop_log:
533 if (lowering(LOG_TO_LOG2))
534 log_to_log2(ir);
535 break;
536
537 case ir_binop_mod:
538 if (lowering(MOD_TO_FRACT) && ir->type->is_float())
539 mod_to_fract(ir);
540 break;
541
542 case ir_binop_pow:
543 if (lowering(POW_TO_EXP2))
544 pow_to_exp2(ir);
545 break;
546
547 case ir_quadop_bitfield_insert:
548 if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
549 bitfield_insert_to_bfm_bfi(ir);
550 break;
551
552 case ir_binop_ldexp:
553 if (lowering(LDEXP_TO_ARITH))
554 ldexp_to_arith(ir);
555 break;
556
557 case ir_binop_carry:
558 if (lowering(CARRY_TO_ARITH))
559 carry_to_arith(ir);
560 break;
561
562 case ir_binop_borrow:
563 if (lowering(BORROW_TO_ARITH))
564 borrow_to_arith(ir);
565 break;
566
567 case ir_unop_saturate:
568 if (lowering(SAT_TO_CLAMP))
569 sat_to_clamp(ir);
570 break;
571
572 default:
573 return visit_continue;
574 }
575
576 return visit_continue;
577 }