2 * Copyright © 2010 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
25 * \file lower_instructions.cpp
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree. This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
32 * Currently supported transformations:
35 * - INT_DIV_TO_MUL_RCP
41 * - BITFIELD_INSERT_TO_BFM_BFI
48 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
50 * This simplifies expression reassociation, and for many backends
51 * there is no subtract operation separate from adding the negation.
52 * For backends with native subtract operations, they will probably
53 * want to recognize add(op0, neg(op1)) or the other way around to
54 * produce a subtract anyway.
56 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
57 * --------------------------------------
58 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
60 * Many GPUs don't have a divide instruction (945 and 965 included),
61 * but they do have an RCP instruction to compute an approximate
62 * reciprocal. By breaking the operation down, constant reciprocals
63 * can get constant folded.
 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
 * handles the integer case, converting to and from floating point so that
 * an RCP is possible.
69 * EXP_TO_EXP2 and LOG_TO_LOG2:
70 * ----------------------------
71 * Many GPUs don't have a base e log or exponent instruction, but they
72 * do have base 2 versions, so this pass converts exp and log to exp2
73 * and log2 operations.
77 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
78 * x**y to 2**(y * log2(x)).
82 * Breaks an ir_binop_mod expression down to (op1 * fract(op0 / op1))
84 * Many GPUs don't have a MOD instruction (945 and 965 included), and
85 * if we have to break it down like this anyway, it gives an
86 * opportunity to do things like constant fold the (1.0 / op1) easily.
90 * Converts ir_binop_ldexp to arithmetic and bit operations.
92 * BITFIELD_INSERT_TO_BFM_BFI:
93 * ---------------------------
94 * Breaks ir_quadop_bitfield_insert into ir_binop_bfm (bitfield mask) and
95 * ir_triop_bfi (bitfield insert).
 * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader5
 * with a pair of instructions.
102 * Converts ir_carry into (x + y) < x.
106 * Converts ir_borrow into (x < y).
110 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
114 #include "main/core.h" /* for M_LOG2E */
115 #include "glsl_types.h"
117 #include "ir_builder.h"
118 #include "ir_optimization.h"
120 using namespace ir_builder
;
124 class lower_instructions_visitor
: public ir_hierarchical_visitor
{
126 lower_instructions_visitor(unsigned lower
)
127 : progress(false), lower(lower
) { }
129 ir_visitor_status
visit_leave(ir_expression
*);
134 unsigned lower
; /** Bitfield of which operations to lower */
136 void sub_to_add_neg(ir_expression
*);
137 void div_to_mul_rcp(ir_expression
*);
138 void int_div_to_mul_rcp(ir_expression
*);
139 void mod_to_fract(ir_expression
*);
140 void exp_to_exp2(ir_expression
*);
141 void pow_to_exp2(ir_expression
*);
142 void log_to_log2(ir_expression
*);
143 void bitfield_insert_to_bfm_bfi(ir_expression
*);
144 void ldexp_to_arith(ir_expression
*);
145 void carry_to_arith(ir_expression
*);
146 void borrow_to_arith(ir_expression
*);
147 void sat_to_clamp(ir_expression
*);
150 } /* anonymous namespace */
/**
 * Determine if a particular type of lowering should occur
 */
#define lowering(x) (this->lower & x)
158 lower_instructions(exec_list
*instructions
, unsigned what_to_lower
)
160 lower_instructions_visitor
v(what_to_lower
);
162 visit_list_elements(&v
, instructions
);
167 lower_instructions_visitor::sub_to_add_neg(ir_expression
*ir
)
169 ir
->operation
= ir_binop_add
;
170 ir
->operands
[1] = new(ir
) ir_expression(ir_unop_neg
, ir
->operands
[1]->type
,
171 ir
->operands
[1], NULL
);
172 this->progress
= true;
176 lower_instructions_visitor::div_to_mul_rcp(ir_expression
*ir
)
178 assert(ir
->operands
[1]->type
->is_float());
180 /* New expression for the 1.0 / op1 */
182 expr
= new(ir
) ir_expression(ir_unop_rcp
,
183 ir
->operands
[1]->type
,
186 /* op0 / op1 -> op0 * (1.0 / op1) */
187 ir
->operation
= ir_binop_mul
;
188 ir
->operands
[1] = expr
;
190 this->progress
= true;
194 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression
*ir
)
196 assert(ir
->operands
[1]->type
->is_integer());
198 /* Be careful with integer division -- we need to do it as a
199 * float and re-truncate, since rcp(n > 1) of an integer would
202 ir_rvalue
*op0
, *op1
;
203 const struct glsl_type
*vec_type
;
205 vec_type
= glsl_type::get_instance(GLSL_TYPE_FLOAT
,
206 ir
->operands
[1]->type
->vector_elements
,
207 ir
->operands
[1]->type
->matrix_columns
);
209 if (ir
->operands
[1]->type
->base_type
== GLSL_TYPE_INT
)
210 op1
= new(ir
) ir_expression(ir_unop_i2f
, vec_type
, ir
->operands
[1], NULL
);
212 op1
= new(ir
) ir_expression(ir_unop_u2f
, vec_type
, ir
->operands
[1], NULL
);
214 op1
= new(ir
) ir_expression(ir_unop_rcp
, op1
->type
, op1
, NULL
);
216 vec_type
= glsl_type::get_instance(GLSL_TYPE_FLOAT
,
217 ir
->operands
[0]->type
->vector_elements
,
218 ir
->operands
[0]->type
->matrix_columns
);
220 if (ir
->operands
[0]->type
->base_type
== GLSL_TYPE_INT
)
221 op0
= new(ir
) ir_expression(ir_unop_i2f
, vec_type
, ir
->operands
[0], NULL
);
223 op0
= new(ir
) ir_expression(ir_unop_u2f
, vec_type
, ir
->operands
[0], NULL
);
225 vec_type
= glsl_type::get_instance(GLSL_TYPE_FLOAT
,
226 ir
->type
->vector_elements
,
227 ir
->type
->matrix_columns
);
229 op0
= new(ir
) ir_expression(ir_binop_mul
, vec_type
, op0
, op1
);
231 if (ir
->operands
[1]->type
->base_type
== GLSL_TYPE_INT
) {
232 ir
->operation
= ir_unop_f2i
;
233 ir
->operands
[0] = op0
;
235 ir
->operation
= ir_unop_i2u
;
236 ir
->operands
[0] = new(ir
) ir_expression(ir_unop_f2i
, op0
);
238 ir
->operands
[1] = NULL
;
240 this->progress
= true;
244 lower_instructions_visitor::exp_to_exp2(ir_expression
*ir
)
246 ir_constant
*log2_e
= new(ir
) ir_constant(float(M_LOG2E
));
248 ir
->operation
= ir_unop_exp2
;
249 ir
->operands
[0] = new(ir
) ir_expression(ir_binop_mul
, ir
->operands
[0]->type
,
250 ir
->operands
[0], log2_e
);
251 this->progress
= true;
255 lower_instructions_visitor::pow_to_exp2(ir_expression
*ir
)
257 ir_expression
*const log2_x
=
258 new(ir
) ir_expression(ir_unop_log2
, ir
->operands
[0]->type
,
261 ir
->operation
= ir_unop_exp2
;
262 ir
->operands
[0] = new(ir
) ir_expression(ir_binop_mul
, ir
->operands
[1]->type
,
263 ir
->operands
[1], log2_x
);
264 ir
->operands
[1] = NULL
;
265 this->progress
= true;
269 lower_instructions_visitor::log_to_log2(ir_expression
*ir
)
271 ir
->operation
= ir_binop_mul
;
272 ir
->operands
[0] = new(ir
) ir_expression(ir_unop_log2
, ir
->operands
[0]->type
,
273 ir
->operands
[0], NULL
);
274 ir
->operands
[1] = new(ir
) ir_constant(float(1.0 / M_LOG2E
));
275 this->progress
= true;
279 lower_instructions_visitor::mod_to_fract(ir_expression
*ir
)
281 ir_variable
*temp
= new(ir
) ir_variable(ir
->operands
[1]->type
, "mod_b",
283 this->base_ir
->insert_before(temp
);
285 ir_assignment
*const assign
=
286 new(ir
) ir_assignment(new(ir
) ir_dereference_variable(temp
),
287 ir
->operands
[1], NULL
);
289 this->base_ir
->insert_before(assign
);
291 ir_expression
*const div_expr
=
292 new(ir
) ir_expression(ir_binop_div
, ir
->operands
[0]->type
,
294 new(ir
) ir_dereference_variable(temp
));
296 /* Don't generate new IR that would need to be lowered in an additional
299 if (lowering(DIV_TO_MUL_RCP
))
300 div_to_mul_rcp(div_expr
);
302 ir_rvalue
*expr
= new(ir
) ir_expression(ir_unop_fract
,
303 ir
->operands
[0]->type
,
307 ir
->operation
= ir_binop_mul
;
308 ir
->operands
[0] = new(ir
) ir_dereference_variable(temp
);
309 ir
->operands
[1] = expr
;
310 this->progress
= true;
314 lower_instructions_visitor::bitfield_insert_to_bfm_bfi(ir_expression
*ir
)
317 * ir_quadop_bitfield_insert base insert offset bits
319 * ir_triop_bfi (ir_binop_bfm bits offset) insert base
322 ir_rvalue
*base_expr
= ir
->operands
[0];
324 ir
->operation
= ir_triop_bfi
;
325 ir
->operands
[0] = new(ir
) ir_expression(ir_binop_bfm
,
326 ir
->type
->get_base_type(),
329 /* ir->operands[1] is still the value to insert. */
330 ir
->operands
[2] = base_expr
;
331 ir
->operands
[3] = NULL
;
333 this->progress
= true;
337 lower_instructions_visitor::ldexp_to_arith(ir_expression
*ir
)
340 * ir_binop_ldexp x exp
343 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
344 * resulting_biased_exp = extracted_biased_exp + exp;
346 * if (resulting_biased_exp < 1) {
347 * return copysign(0.0, x);
350 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
351 * lshift(i2u(resulting_biased_exp), exp_shift));
353 * which we can't actually implement as such, since the GLSL IR doesn't
354 * have vectorized if-statements. We actually implement it without branches
355 * using conditional-select:
357 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
358 * resulting_biased_exp = extracted_biased_exp + exp;
360 * is_not_zero_or_underflow = gequal(resulting_biased_exp, 1);
361 * x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
362 * resulting_biased_exp = csel(is_not_zero_or_underflow,
363 * resulting_biased_exp, 0);
365 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
366 * lshift(i2u(resulting_biased_exp), exp_shift));
369 const unsigned vec_elem
= ir
->type
->vector_elements
;
372 const glsl_type
*ivec
= glsl_type::get_instance(GLSL_TYPE_INT
, vec_elem
, 1);
373 const glsl_type
*bvec
= glsl_type::get_instance(GLSL_TYPE_BOOL
, vec_elem
, 1);
376 ir_constant
*zeroi
= ir_constant::zero(ir
, ivec
);
378 ir_constant
*sign_mask
= new(ir
) ir_constant(0x80000000u
, vec_elem
);
380 ir_constant
*exp_shift
= new(ir
) ir_constant(23);
381 ir_constant
*exp_width
= new(ir
) ir_constant(8);
383 /* Temporary variables */
384 ir_variable
*x
= new(ir
) ir_variable(ir
->type
, "x", ir_var_temporary
);
385 ir_variable
*exp
= new(ir
) ir_variable(ivec
, "exp", ir_var_temporary
);
387 ir_variable
*zero_sign_x
= new(ir
) ir_variable(ir
->type
, "zero_sign_x",
390 ir_variable
*extracted_biased_exp
=
391 new(ir
) ir_variable(ivec
, "extracted_biased_exp", ir_var_temporary
);
392 ir_variable
*resulting_biased_exp
=
393 new(ir
) ir_variable(ivec
, "resulting_biased_exp", ir_var_temporary
);
395 ir_variable
*is_not_zero_or_underflow
=
396 new(ir
) ir_variable(bvec
, "is_not_zero_or_underflow", ir_var_temporary
);
398 ir_instruction
&i
= *base_ir
;
400 /* Copy <x> and <exp> arguments. */
402 i
.insert_before(assign(x
, ir
->operands
[0]));
403 i
.insert_before(exp
);
404 i
.insert_before(assign(exp
, ir
->operands
[1]));
406 /* Extract the biased exponent from <x>. */
407 i
.insert_before(extracted_biased_exp
);
408 i
.insert_before(assign(extracted_biased_exp
,
409 rshift(bitcast_f2i(abs(x
)), exp_shift
)));
411 i
.insert_before(resulting_biased_exp
);
412 i
.insert_before(assign(resulting_biased_exp
,
413 add(extracted_biased_exp
, exp
)));
415 /* Test if result is ±0.0, subnormal, or underflow by checking if the
416 * resulting biased exponent would be less than 0x1. If so, the result is
417 * 0.0 with the sign of x. (Actually, invert the conditions so that
418 * immediate values are the second arguments, which is better for i965)
420 i
.insert_before(zero_sign_x
);
421 i
.insert_before(assign(zero_sign_x
,
422 bitcast_u2f(bit_and(bitcast_f2u(x
), sign_mask
))));
424 i
.insert_before(is_not_zero_or_underflow
);
425 i
.insert_before(assign(is_not_zero_or_underflow
,
426 gequal(resulting_biased_exp
,
427 new(ir
) ir_constant(0x1, vec_elem
))));
428 i
.insert_before(assign(x
, csel(is_not_zero_or_underflow
,
430 i
.insert_before(assign(resulting_biased_exp
,
431 csel(is_not_zero_or_underflow
,
432 resulting_biased_exp
, zeroi
)));
434 /* We could test for overflows by checking if the resulting biased exponent
435 * would be greater than 0xFE. Turns out we don't need to because the GLSL
438 * "If this product is too large to be represented in the
439 * floating-point type, the result is undefined."
442 ir_constant
*exp_shift_clone
= exp_shift
->clone(ir
, NULL
);
443 ir
->operation
= ir_unop_bitcast_i2f
;
444 ir
->operands
[0] = bitfield_insert(bitcast_f2i(x
), resulting_biased_exp
,
445 exp_shift_clone
, exp_width
);
446 ir
->operands
[1] = NULL
;
448 /* Don't generate new IR that would need to be lowered in an additional
451 if (lowering(BITFIELD_INSERT_TO_BFM_BFI
))
452 bitfield_insert_to_bfm_bfi(ir
->operands
[0]->as_expression());
454 this->progress
= true;
458 lower_instructions_visitor::carry_to_arith(ir_expression
*ir
)
463 * sum = ir_binop_add x y
464 * bcarry = ir_binop_less sum x
465 * carry = ir_unop_b2i bcarry
468 ir_rvalue
*x_clone
= ir
->operands
[0]->clone(ir
, NULL
);
469 ir
->operation
= ir_unop_i2u
;
470 ir
->operands
[0] = b2i(less(add(ir
->operands
[0], ir
->operands
[1]), x_clone
));
471 ir
->operands
[1] = NULL
;
473 this->progress
= true;
477 lower_instructions_visitor::borrow_to_arith(ir_expression
*ir
)
480 * ir_binop_borrow x y
482 * bcarry = ir_binop_less x y
483 * carry = ir_unop_b2i bcarry
486 ir
->operation
= ir_unop_i2u
;
487 ir
->operands
[0] = b2i(less(ir
->operands
[0], ir
->operands
[1]));
488 ir
->operands
[1] = NULL
;
490 this->progress
= true;
494 lower_instructions_visitor::sat_to_clamp(ir_expression
*ir
)
499 * ir_binop_min (ir_binop_max(x, 0.0), 1.0)
502 ir
->operation
= ir_binop_min
;
503 ir
->operands
[0] = new(ir
) ir_expression(ir_binop_max
, ir
->operands
[0]->type
,
505 new(ir
) ir_constant(0.0f
));
506 ir
->operands
[1] = new(ir
) ir_constant(1.0f
);
508 this->progress
= true;
512 lower_instructions_visitor::visit_leave(ir_expression
*ir
)
514 switch (ir
->operation
) {
516 if (lowering(SUB_TO_ADD_NEG
))
521 if (ir
->operands
[1]->type
->is_integer() && lowering(INT_DIV_TO_MUL_RCP
))
522 int_div_to_mul_rcp(ir
);
523 else if (ir
->operands
[1]->type
->is_float() && lowering(DIV_TO_MUL_RCP
))
528 if (lowering(EXP_TO_EXP2
))
533 if (lowering(LOG_TO_LOG2
))
538 if (lowering(MOD_TO_FRACT
) && ir
->type
->is_float())
543 if (lowering(POW_TO_EXP2
))
547 case ir_quadop_bitfield_insert
:
548 if (lowering(BITFIELD_INSERT_TO_BFM_BFI
))
549 bitfield_insert_to_bfm_bfi(ir
);
553 if (lowering(LDEXP_TO_ARITH
))
558 if (lowering(CARRY_TO_ARITH
))
562 case ir_binop_borrow
:
563 if (lowering(BORROW_TO_ARITH
))
567 case ir_unop_saturate
:
568 if (lowering(SAT_TO_CLAMP
))
573 return visit_continue
;
576 return visit_continue
;