/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
/**
 * \file lower_instructions.cpp
 *
 * Many GPUs lack native instructions for certain expression operations, and
 * must replace them with some other expression tree.  This pass lowers some
 * of the most common cases, allowing the lowering code to be implemented once
 * rather than in each driver backend.
 *
 * Currently supported transformations:
 * - SUB_TO_ADD_NEG
 * - DIV_TO_MUL_RCP
 * - INT_DIV_TO_MUL_RCP
 * - EXP_TO_EXP2
 * - POW_TO_EXP2
 * - LOG_TO_LOG2
 * - MOD_TO_FLOOR
 * - LDEXP_TO_ARITH
 * - DFREXP_DLDEXP_TO_ARITH
 * - BITFIELD_INSERT_TO_BFM_BFI
 * - CARRY_TO_ARITH
 * - BORROW_TO_ARITH
 * - SAT_TO_CLAMP
 * - DOPS_TO_DFRAC
 *
 * SUB_TO_ADD_NEG:
 * ---------------
 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
 *
 * This simplifies expression reassociation, and for many backends
 * there is no subtract operation separate from adding the negation.
 * For backends with native subtract operations, they will probably
 * want to recognize add(op0, neg(op1)) or the other way around to
 * produce a subtract anyway.
 *
 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
 * --------------------------------------
 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
 *
 * Many GPUs don't have a divide instruction (945 and 965 included),
 * but they do have an RCP instruction to compute an approximate
 * reciprocal.  By breaking the operation down, constant reciprocals
 * can get constant folded.
 *
 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
 * handles the integer case, converting to and from floating point so that
 * RCP is possible.
 *
 * EXP_TO_EXP2 and LOG_TO_LOG2:
 * ----------------------------
 * Many GPUs don't have a base e log or exponent instruction, but they
 * do have base 2 versions, so this pass converts exp and log to exp2
 * and log2 operations.
 *
 * POW_TO_EXP2:
 * ------------
 * Many older GPUs don't have an x**y instruction.  For these GPUs, convert
 * x**y to 2**(y * log2(x)).
 *
 * MOD_TO_FLOOR:
 * -------------
 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
 *
 * Many GPUs don't have a MOD instruction (945 and 965 included), and
 * if we have to break it down like this anyway, it gives an
 * opportunity to do things like constant fold the (1.0 / op1) easily.
 *
 * Note: before we used to implement this as op1 * fract(op0 / op1) but this
 * implementation had significant precision errors.
 *
 * LDEXP_TO_ARITH:
 * ---------------
 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
 *
 * DFREXP_DLDEXP_TO_ARITH:
 * -----------------------
 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
 * arithmetic and bit ops for double arguments.
 *
 * BITFIELD_INSERT_TO_BFM_BFI:
 * ---------------------------
 * Breaks ir_quadop_bitfield_insert into ir_binop_bfm (bitfield mask) and
 * ir_triop_bfi (bitfield insert).
 *
 * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader_5
 * with a pair of instructions.
 *
 * CARRY_TO_ARITH:
 * ---------------
 * Converts ir_carry into (x + y) < x.
 *
 * BORROW_TO_ARITH:
 * ----------------
 * Converts ir_borrow into (x < y).
 *
 * SAT_TO_CLAMP:
 * -------------
 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
 *
 * DOPS_TO_DFRAC:
 * --------------
 * Converts double trunc, ceil, floor, round to fract
 */
127 #include "c99_math.h"
128 #include "program/prog_instruction.h" /* for swizzle */
129 #include "glsl_types.h"
131 #include "ir_builder.h"
132 #include "ir_optimization.h"
134 using namespace ir_builder
;
138 class lower_instructions_visitor
: public ir_hierarchical_visitor
{
140 lower_instructions_visitor(unsigned lower
)
141 : progress(false), lower(lower
) { }
143 ir_visitor_status
visit_leave(ir_expression
*);
148 unsigned lower
; /** Bitfield of which operations to lower */
150 void sub_to_add_neg(ir_expression
*);
151 void div_to_mul_rcp(ir_expression
*);
152 void int_div_to_mul_rcp(ir_expression
*);
153 void mod_to_floor(ir_expression
*);
154 void exp_to_exp2(ir_expression
*);
155 void pow_to_exp2(ir_expression
*);
156 void log_to_log2(ir_expression
*);
157 void bitfield_insert_to_bfm_bfi(ir_expression
*);
158 void ldexp_to_arith(ir_expression
*);
159 void dldexp_to_arith(ir_expression
*);
160 void dfrexp_sig_to_arith(ir_expression
*);
161 void dfrexp_exp_to_arith(ir_expression
*);
162 void carry_to_arith(ir_expression
*);
163 void borrow_to_arith(ir_expression
*);
164 void sat_to_clamp(ir_expression
*);
165 void double_dot_to_fma(ir_expression
*);
166 void double_lrp(ir_expression
*);
167 void dceil_to_dfrac(ir_expression
*);
168 void dfloor_to_dfrac(ir_expression
*);
169 void dround_even_to_dfrac(ir_expression
*);
170 void dtrunc_to_dfrac(ir_expression
*);
171 void dsign_to_csel(ir_expression
*);
174 } /* anonymous namespace */
/**
 * Determine if a particular type of lowering should occur
 *
 * Evaluates to non-zero when any bit of \c x is set in this->lower.  The
 * argument is parenthesised so that a compound expression such as
 * (A | B) is masked as a whole instead of being split by operator
 * precedence.
 */
#define lowering(x) (this->lower & (x))
182 lower_instructions(exec_list
*instructions
, unsigned what_to_lower
)
184 lower_instructions_visitor
v(what_to_lower
);
186 visit_list_elements(&v
, instructions
);
191 lower_instructions_visitor::sub_to_add_neg(ir_expression
*ir
)
193 ir
->operation
= ir_binop_add
;
194 ir
->operands
[1] = new(ir
) ir_expression(ir_unop_neg
, ir
->operands
[1]->type
,
195 ir
->operands
[1], NULL
);
196 this->progress
= true;
200 lower_instructions_visitor::div_to_mul_rcp(ir_expression
*ir
)
202 assert(ir
->operands
[1]->type
->is_float() || ir
->operands
[1]->type
->is_double());
204 /* New expression for the 1.0 / op1 */
206 expr
= new(ir
) ir_expression(ir_unop_rcp
,
207 ir
->operands
[1]->type
,
210 /* op0 / op1 -> op0 * (1.0 / op1) */
211 ir
->operation
= ir_binop_mul
;
212 ir
->operands
[1] = expr
;
214 this->progress
= true;
218 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression
*ir
)
220 assert(ir
->operands
[1]->type
->is_integer());
222 /* Be careful with integer division -- we need to do it as a
223 * float and re-truncate, since rcp(n > 1) of an integer would
226 ir_rvalue
*op0
, *op1
;
227 const struct glsl_type
*vec_type
;
229 vec_type
= glsl_type::get_instance(GLSL_TYPE_FLOAT
,
230 ir
->operands
[1]->type
->vector_elements
,
231 ir
->operands
[1]->type
->matrix_columns
);
233 if (ir
->operands
[1]->type
->base_type
== GLSL_TYPE_INT
)
234 op1
= new(ir
) ir_expression(ir_unop_i2f
, vec_type
, ir
->operands
[1], NULL
);
236 op1
= new(ir
) ir_expression(ir_unop_u2f
, vec_type
, ir
->operands
[1], NULL
);
238 op1
= new(ir
) ir_expression(ir_unop_rcp
, op1
->type
, op1
, NULL
);
240 vec_type
= glsl_type::get_instance(GLSL_TYPE_FLOAT
,
241 ir
->operands
[0]->type
->vector_elements
,
242 ir
->operands
[0]->type
->matrix_columns
);
244 if (ir
->operands
[0]->type
->base_type
== GLSL_TYPE_INT
)
245 op0
= new(ir
) ir_expression(ir_unop_i2f
, vec_type
, ir
->operands
[0], NULL
);
247 op0
= new(ir
) ir_expression(ir_unop_u2f
, vec_type
, ir
->operands
[0], NULL
);
249 vec_type
= glsl_type::get_instance(GLSL_TYPE_FLOAT
,
250 ir
->type
->vector_elements
,
251 ir
->type
->matrix_columns
);
253 op0
= new(ir
) ir_expression(ir_binop_mul
, vec_type
, op0
, op1
);
255 if (ir
->operands
[1]->type
->base_type
== GLSL_TYPE_INT
) {
256 ir
->operation
= ir_unop_f2i
;
257 ir
->operands
[0] = op0
;
259 ir
->operation
= ir_unop_i2u
;
260 ir
->operands
[0] = new(ir
) ir_expression(ir_unop_f2i
, op0
);
262 ir
->operands
[1] = NULL
;
264 this->progress
= true;
268 lower_instructions_visitor::exp_to_exp2(ir_expression
*ir
)
270 ir_constant
*log2_e
= new(ir
) ir_constant(float(M_LOG2E
));
272 ir
->operation
= ir_unop_exp2
;
273 ir
->operands
[0] = new(ir
) ir_expression(ir_binop_mul
, ir
->operands
[0]->type
,
274 ir
->operands
[0], log2_e
);
275 this->progress
= true;
279 lower_instructions_visitor::pow_to_exp2(ir_expression
*ir
)
281 ir_expression
*const log2_x
=
282 new(ir
) ir_expression(ir_unop_log2
, ir
->operands
[0]->type
,
285 ir
->operation
= ir_unop_exp2
;
286 ir
->operands
[0] = new(ir
) ir_expression(ir_binop_mul
, ir
->operands
[1]->type
,
287 ir
->operands
[1], log2_x
);
288 ir
->operands
[1] = NULL
;
289 this->progress
= true;
293 lower_instructions_visitor::log_to_log2(ir_expression
*ir
)
295 ir
->operation
= ir_binop_mul
;
296 ir
->operands
[0] = new(ir
) ir_expression(ir_unop_log2
, ir
->operands
[0]->type
,
297 ir
->operands
[0], NULL
);
298 ir
->operands
[1] = new(ir
) ir_constant(float(1.0 / M_LOG2E
));
299 this->progress
= true;
303 lower_instructions_visitor::mod_to_floor(ir_expression
*ir
)
305 ir_variable
*x
= new(ir
) ir_variable(ir
->operands
[0]->type
, "mod_x",
307 ir_variable
*y
= new(ir
) ir_variable(ir
->operands
[1]->type
, "mod_y",
309 this->base_ir
->insert_before(x
);
310 this->base_ir
->insert_before(y
);
312 ir_assignment
*const assign_x
=
313 new(ir
) ir_assignment(new(ir
) ir_dereference_variable(x
),
314 ir
->operands
[0], NULL
);
315 ir_assignment
*const assign_y
=
316 new(ir
) ir_assignment(new(ir
) ir_dereference_variable(y
),
317 ir
->operands
[1], NULL
);
319 this->base_ir
->insert_before(assign_x
);
320 this->base_ir
->insert_before(assign_y
);
322 ir_expression
*const div_expr
=
323 new(ir
) ir_expression(ir_binop_div
, x
->type
,
324 new(ir
) ir_dereference_variable(x
),
325 new(ir
) ir_dereference_variable(y
));
327 /* Don't generate new IR that would need to be lowered in an additional
330 if (lowering(DIV_TO_MUL_RCP
) && (ir
->type
->is_float() || ir
->type
->is_double()))
331 div_to_mul_rcp(div_expr
);
333 ir_expression
*const floor_expr
=
334 new(ir
) ir_expression(ir_unop_floor
, x
->type
, div_expr
);
336 if (lowering(DOPS_TO_DFRAC
) && ir
->type
->is_double())
337 dfloor_to_dfrac(floor_expr
);
339 ir_expression
*const mul_expr
=
340 new(ir
) ir_expression(ir_binop_mul
,
341 new(ir
) ir_dereference_variable(y
),
344 ir
->operation
= ir_binop_sub
;
345 ir
->operands
[0] = new(ir
) ir_dereference_variable(x
);
346 ir
->operands
[1] = mul_expr
;
347 this->progress
= true;
351 lower_instructions_visitor::bitfield_insert_to_bfm_bfi(ir_expression
*ir
)
354 * ir_quadop_bitfield_insert base insert offset bits
356 * ir_triop_bfi (ir_binop_bfm bits offset) insert base
359 ir_rvalue
*base_expr
= ir
->operands
[0];
361 ir
->operation
= ir_triop_bfi
;
362 ir
->operands
[0] = new(ir
) ir_expression(ir_binop_bfm
,
363 ir
->type
->get_base_type(),
366 /* ir->operands[1] is still the value to insert. */
367 ir
->operands
[2] = base_expr
;
368 ir
->operands
[3] = NULL
;
370 this->progress
= true;
374 lower_instructions_visitor::ldexp_to_arith(ir_expression
*ir
)
377 * ir_binop_ldexp x exp
380 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
381 * resulting_biased_exp = extracted_biased_exp + exp;
383 * if (resulting_biased_exp < 1) {
384 * return copysign(0.0, x);
387 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
388 * lshift(i2u(resulting_biased_exp), exp_shift));
390 * which we can't actually implement as such, since the GLSL IR doesn't
391 * have vectorized if-statements. We actually implement it without branches
392 * using conditional-select:
394 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
395 * resulting_biased_exp = extracted_biased_exp + exp;
397 * is_not_zero_or_underflow = gequal(resulting_biased_exp, 1);
398 * x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
399 * resulting_biased_exp = csel(is_not_zero_or_underflow,
400 * resulting_biased_exp, 0);
402 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
403 * lshift(i2u(resulting_biased_exp), exp_shift));
406 const unsigned vec_elem
= ir
->type
->vector_elements
;
409 const glsl_type
*ivec
= glsl_type::get_instance(GLSL_TYPE_INT
, vec_elem
, 1);
410 const glsl_type
*bvec
= glsl_type::get_instance(GLSL_TYPE_BOOL
, vec_elem
, 1);
413 ir_constant
*zeroi
= ir_constant::zero(ir
, ivec
);
415 ir_constant
*sign_mask
= new(ir
) ir_constant(0x80000000u
, vec_elem
);
417 ir_constant
*exp_shift
= new(ir
) ir_constant(23);
418 ir_constant
*exp_width
= new(ir
) ir_constant(8);
420 /* Temporary variables */
421 ir_variable
*x
= new(ir
) ir_variable(ir
->type
, "x", ir_var_temporary
);
422 ir_variable
*exp
= new(ir
) ir_variable(ivec
, "exp", ir_var_temporary
);
424 ir_variable
*zero_sign_x
= new(ir
) ir_variable(ir
->type
, "zero_sign_x",
427 ir_variable
*extracted_biased_exp
=
428 new(ir
) ir_variable(ivec
, "extracted_biased_exp", ir_var_temporary
);
429 ir_variable
*resulting_biased_exp
=
430 new(ir
) ir_variable(ivec
, "resulting_biased_exp", ir_var_temporary
);
432 ir_variable
*is_not_zero_or_underflow
=
433 new(ir
) ir_variable(bvec
, "is_not_zero_or_underflow", ir_var_temporary
);
435 ir_instruction
&i
= *base_ir
;
437 /* Copy <x> and <exp> arguments. */
439 i
.insert_before(assign(x
, ir
->operands
[0]));
440 i
.insert_before(exp
);
441 i
.insert_before(assign(exp
, ir
->operands
[1]));
443 /* Extract the biased exponent from <x>. */
444 i
.insert_before(extracted_biased_exp
);
445 i
.insert_before(assign(extracted_biased_exp
,
446 rshift(bitcast_f2i(abs(x
)), exp_shift
)));
448 i
.insert_before(resulting_biased_exp
);
449 i
.insert_before(assign(resulting_biased_exp
,
450 add(extracted_biased_exp
, exp
)));
452 /* Test if result is ±0.0, subnormal, or underflow by checking if the
453 * resulting biased exponent would be less than 0x1. If so, the result is
454 * 0.0 with the sign of x. (Actually, invert the conditions so that
455 * immediate values are the second arguments, which is better for i965)
457 i
.insert_before(zero_sign_x
);
458 i
.insert_before(assign(zero_sign_x
,
459 bitcast_u2f(bit_and(bitcast_f2u(x
), sign_mask
))));
461 i
.insert_before(is_not_zero_or_underflow
);
462 i
.insert_before(assign(is_not_zero_or_underflow
,
463 gequal(resulting_biased_exp
,
464 new(ir
) ir_constant(0x1, vec_elem
))));
465 i
.insert_before(assign(x
, csel(is_not_zero_or_underflow
,
467 i
.insert_before(assign(resulting_biased_exp
,
468 csel(is_not_zero_or_underflow
,
469 resulting_biased_exp
, zeroi
)));
471 /* We could test for overflows by checking if the resulting biased exponent
472 * would be greater than 0xFE. Turns out we don't need to because the GLSL
475 * "If this product is too large to be represented in the
476 * floating-point type, the result is undefined."
479 ir_constant
*exp_shift_clone
= exp_shift
->clone(ir
, NULL
);
480 ir
->operation
= ir_unop_bitcast_i2f
;
481 ir
->operands
[0] = bitfield_insert(bitcast_f2i(x
), resulting_biased_exp
,
482 exp_shift_clone
, exp_width
);
483 ir
->operands
[1] = NULL
;
485 /* Don't generate new IR that would need to be lowered in an additional
488 if (lowering(BITFIELD_INSERT_TO_BFM_BFI
))
489 bitfield_insert_to_bfm_bfi(ir
->operands
[0]->as_expression());
491 this->progress
= true;
495 lower_instructions_visitor::dldexp_to_arith(ir_expression
*ir
)
497 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
498 * from the significand.
501 const unsigned vec_elem
= ir
->type
->vector_elements
;
504 const glsl_type
*ivec
= glsl_type::get_instance(GLSL_TYPE_INT
, vec_elem
, 1);
505 const glsl_type
*bvec
= glsl_type::get_instance(GLSL_TYPE_BOOL
, vec_elem
, 1);
508 ir_constant
*zeroi
= ir_constant::zero(ir
, ivec
);
510 ir_constant
*sign_mask
= new(ir
) ir_constant(0x80000000u
);
512 ir_constant
*exp_shift
= new(ir
) ir_constant(20);
513 ir_constant
*exp_width
= new(ir
) ir_constant(11);
514 ir_constant
*exp_bias
= new(ir
) ir_constant(1022, vec_elem
);
516 /* Temporary variables */
517 ir_variable
*x
= new(ir
) ir_variable(ir
->type
, "x", ir_var_temporary
);
518 ir_variable
*exp
= new(ir
) ir_variable(ivec
, "exp", ir_var_temporary
);
520 ir_variable
*zero_sign_x
= new(ir
) ir_variable(ir
->type
, "zero_sign_x",
523 ir_variable
*extracted_biased_exp
=
524 new(ir
) ir_variable(ivec
, "extracted_biased_exp", ir_var_temporary
);
525 ir_variable
*resulting_biased_exp
=
526 new(ir
) ir_variable(ivec
, "resulting_biased_exp", ir_var_temporary
);
528 ir_variable
*is_not_zero_or_underflow
=
529 new(ir
) ir_variable(bvec
, "is_not_zero_or_underflow", ir_var_temporary
);
531 ir_instruction
&i
= *base_ir
;
533 /* Copy <x> and <exp> arguments. */
535 i
.insert_before(assign(x
, ir
->operands
[0]));
536 i
.insert_before(exp
);
537 i
.insert_before(assign(exp
, ir
->operands
[1]));
539 ir_expression
*frexp_exp
= expr(ir_unop_frexp_exp
, x
);
540 if (lowering(DFREXP_DLDEXP_TO_ARITH
))
541 dfrexp_exp_to_arith(frexp_exp
);
543 /* Extract the biased exponent from <x>. */
544 i
.insert_before(extracted_biased_exp
);
545 i
.insert_before(assign(extracted_biased_exp
, add(frexp_exp
, exp_bias
)));
547 i
.insert_before(resulting_biased_exp
);
548 i
.insert_before(assign(resulting_biased_exp
,
549 add(extracted_biased_exp
, exp
)));
551 /* Test if result is ±0.0, subnormal, or underflow by checking if the
552 * resulting biased exponent would be less than 0x1. If so, the result is
553 * 0.0 with the sign of x. (Actually, invert the conditions so that
554 * immediate values are the second arguments, which is better for i965)
555 * TODO: Implement in a vector fashion.
557 i
.insert_before(zero_sign_x
);
558 for (unsigned elem
= 0; elem
< vec_elem
; elem
++) {
559 ir_variable
*unpacked
=
560 new(ir
) ir_variable(glsl_type::uvec2_type
, "unpacked", ir_var_temporary
);
561 i
.insert_before(unpacked
);
564 expr(ir_unop_unpack_double_2x32
, swizzle(x
, elem
, 1))));
565 i
.insert_before(assign(unpacked
, bit_and(swizzle_y(unpacked
), sign_mask
->clone(ir
, NULL
)),
567 i
.insert_before(assign(unpacked
, ir_constant::zero(ir
, glsl_type::uint_type
), WRITEMASK_X
));
568 i
.insert_before(assign(zero_sign_x
,
569 expr(ir_unop_pack_double_2x32
, unpacked
),
572 i
.insert_before(is_not_zero_or_underflow
);
573 i
.insert_before(assign(is_not_zero_or_underflow
,
574 gequal(resulting_biased_exp
,
575 new(ir
) ir_constant(0x1, vec_elem
))));
576 i
.insert_before(assign(x
, csel(is_not_zero_or_underflow
,
578 i
.insert_before(assign(resulting_biased_exp
,
579 csel(is_not_zero_or_underflow
,
580 resulting_biased_exp
, zeroi
)));
582 /* We could test for overflows by checking if the resulting biased exponent
583 * would be greater than 0xFE. Turns out we don't need to because the GLSL
586 * "If this product is too large to be represented in the
587 * floating-point type, the result is undefined."
590 ir_rvalue
*results
[4] = {NULL
};
591 for (unsigned elem
= 0; elem
< vec_elem
; elem
++) {
592 ir_variable
*unpacked
=
593 new(ir
) ir_variable(glsl_type::uvec2_type
, "unpacked", ir_var_temporary
);
594 i
.insert_before(unpacked
);
597 expr(ir_unop_unpack_double_2x32
, swizzle(x
, elem
, 1))));
599 ir_expression
*bfi
= bitfield_insert(
601 i2u(swizzle(resulting_biased_exp
, elem
, 1)),
602 exp_shift
->clone(ir
, NULL
),
603 exp_width
->clone(ir
, NULL
));
605 if (lowering(BITFIELD_INSERT_TO_BFM_BFI
))
606 bitfield_insert_to_bfm_bfi(bfi
);
608 i
.insert_before(assign(unpacked
, bfi
, WRITEMASK_Y
));
610 results
[elem
] = expr(ir_unop_pack_double_2x32
, unpacked
);
613 ir
->operation
= ir_quadop_vector
;
614 ir
->operands
[0] = results
[0];
615 ir
->operands
[1] = results
[1];
616 ir
->operands
[2] = results
[2];
617 ir
->operands
[3] = results
[3];
619 /* Don't generate new IR that would need to be lowered in an additional
623 this->progress
= true;
627 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression
*ir
)
629 const unsigned vec_elem
= ir
->type
->vector_elements
;
630 const glsl_type
*bvec
= glsl_type::get_instance(GLSL_TYPE_BOOL
, vec_elem
, 1);
632 /* Double-precision floating-point values are stored as
637 * We're just extracting the significand here, so we only need to modify
638 * the upper 32-bit uint. Unfortunately we must extract each double
639 * independently as there is no vector version of unpackDouble.
642 ir_instruction
&i
= *base_ir
;
644 ir_variable
*is_not_zero
=
645 new(ir
) ir_variable(bvec
, "is_not_zero", ir_var_temporary
);
646 ir_rvalue
*results
[4] = {NULL
};
648 ir_constant
*dzero
= new(ir
) ir_constant(0.0, vec_elem
);
649 i
.insert_before(is_not_zero
);
652 nequal(abs(ir
->operands
[0]->clone(ir
, NULL
)), dzero
)));
654 /* TODO: Remake this as more vector-friendly when int64 support is
657 for (unsigned elem
= 0; elem
< vec_elem
; elem
++) {
658 ir_constant
*zero
= new(ir
) ir_constant(0u, 1);
659 ir_constant
*sign_mantissa_mask
= new(ir
) ir_constant(0x800fffffu
, 1);
661 /* Exponent of double floating-point values in the range [0.5, 1.0). */
662 ir_constant
*exponent_value
= new(ir
) ir_constant(0x3fe00000u
, 1);
665 new(ir
) ir_variable(glsl_type::uint_type
, "bits", ir_var_temporary
);
666 ir_variable
*unpacked
=
667 new(ir
) ir_variable(glsl_type::uvec2_type
, "unpacked", ir_var_temporary
);
669 ir_rvalue
*x
= swizzle(ir
->operands
[0]->clone(ir
, NULL
), elem
, 1);
671 i
.insert_before(bits
);
672 i
.insert_before(unpacked
);
673 i
.insert_before(assign(unpacked
, expr(ir_unop_unpack_double_2x32
, x
)));
675 /* Manipulate the high uint to remove the exponent and replace it with
676 * either the default exponent or zero.
678 i
.insert_before(assign(bits
, swizzle_y(unpacked
)));
679 i
.insert_before(assign(bits
, bit_and(bits
, sign_mantissa_mask
)));
680 i
.insert_before(assign(bits
, bit_or(bits
,
681 csel(swizzle(is_not_zero
, elem
, 1),
684 i
.insert_before(assign(unpacked
, bits
, WRITEMASK_Y
));
685 results
[elem
] = expr(ir_unop_pack_double_2x32
, unpacked
);
688 /* Put the dvec back together */
689 ir
->operation
= ir_quadop_vector
;
690 ir
->operands
[0] = results
[0];
691 ir
->operands
[1] = results
[1];
692 ir
->operands
[2] = results
[2];
693 ir
->operands
[3] = results
[3];
695 this->progress
= true;
699 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression
*ir
)
701 const unsigned vec_elem
= ir
->type
->vector_elements
;
702 const glsl_type
*bvec
= glsl_type::get_instance(GLSL_TYPE_BOOL
, vec_elem
, 1);
703 const glsl_type
*uvec
= glsl_type::get_instance(GLSL_TYPE_UINT
, vec_elem
, 1);
705 /* Double-precision floating-point values are stored as
710 * We're just extracting the exponent here, so we only care about the upper
714 ir_instruction
&i
= *base_ir
;
716 ir_variable
*is_not_zero
=
717 new(ir
) ir_variable(bvec
, "is_not_zero", ir_var_temporary
);
718 ir_variable
*high_words
=
719 new(ir
) ir_variable(uvec
, "high_words", ir_var_temporary
);
720 ir_constant
*dzero
= new(ir
) ir_constant(0.0, vec_elem
);
721 ir_constant
*izero
= new(ir
) ir_constant(0, vec_elem
);
723 ir_rvalue
*absval
= abs(ir
->operands
[0]);
725 i
.insert_before(is_not_zero
);
726 i
.insert_before(high_words
);
727 i
.insert_before(assign(is_not_zero
, nequal(absval
->clone(ir
, NULL
), dzero
)));
729 /* Extract all of the upper uints. */
730 for (unsigned elem
= 0; elem
< vec_elem
; elem
++) {
731 ir_rvalue
*x
= swizzle(absval
->clone(ir
, NULL
), elem
, 1);
733 i
.insert_before(assign(high_words
,
734 swizzle_y(expr(ir_unop_unpack_double_2x32
, x
)),
738 ir_constant
*exponent_shift
= new(ir
) ir_constant(20, vec_elem
);
739 ir_constant
*exponent_bias
= new(ir
) ir_constant(-1022, vec_elem
);
741 /* For non-zero inputs, shift the exponent down and apply bias. */
742 ir
->operation
= ir_triop_csel
;
743 ir
->operands
[0] = new(ir
) ir_dereference_variable(is_not_zero
);
744 ir
->operands
[1] = add(exponent_bias
, u2i(rshift(high_words
, exponent_shift
)));
745 ir
->operands
[2] = izero
;
747 this->progress
= true;
751 lower_instructions_visitor::carry_to_arith(ir_expression
*ir
)
756 * sum = ir_binop_add x y
757 * bcarry = ir_binop_less sum x
758 * carry = ir_unop_b2i bcarry
761 ir_rvalue
*x_clone
= ir
->operands
[0]->clone(ir
, NULL
);
762 ir
->operation
= ir_unop_i2u
;
763 ir
->operands
[0] = b2i(less(add(ir
->operands
[0], ir
->operands
[1]), x_clone
));
764 ir
->operands
[1] = NULL
;
766 this->progress
= true;
770 lower_instructions_visitor::borrow_to_arith(ir_expression
*ir
)
773 * ir_binop_borrow x y
775 * bcarry = ir_binop_less x y
776 * carry = ir_unop_b2i bcarry
779 ir
->operation
= ir_unop_i2u
;
780 ir
->operands
[0] = b2i(less(ir
->operands
[0], ir
->operands
[1]));
781 ir
->operands
[1] = NULL
;
783 this->progress
= true;
787 lower_instructions_visitor::sat_to_clamp(ir_expression
*ir
)
792 * ir_binop_min (ir_binop_max(x, 0.0), 1.0)
795 ir
->operation
= ir_binop_min
;
796 ir
->operands
[0] = new(ir
) ir_expression(ir_binop_max
, ir
->operands
[0]->type
,
798 new(ir
) ir_constant(0.0f
));
799 ir
->operands
[1] = new(ir
) ir_constant(1.0f
);
801 this->progress
= true;
805 lower_instructions_visitor::double_dot_to_fma(ir_expression
*ir
)
807 ir_variable
*temp
= new(ir
) ir_variable(ir
->operands
[0]->type
->get_base_type(), "dot_res",
809 this->base_ir
->insert_before(temp
);
811 int nc
= ir
->operands
[0]->type
->components();
812 for (int i
= nc
- 1; i
>= 1; i
--) {
813 ir_assignment
*assig
;
815 assig
= assign(temp
, mul(swizzle(ir
->operands
[0]->clone(ir
, NULL
), i
, 1),
816 swizzle(ir
->operands
[1]->clone(ir
, NULL
), i
, 1)));
818 assig
= assign(temp
, fma(swizzle(ir
->operands
[0]->clone(ir
, NULL
), i
, 1),
819 swizzle(ir
->operands
[1]->clone(ir
, NULL
), i
, 1),
822 this->base_ir
->insert_before(assig
);
825 ir
->operation
= ir_triop_fma
;
826 ir
->operands
[0] = swizzle(ir
->operands
[0], 0, 1);
827 ir
->operands
[1] = swizzle(ir
->operands
[1], 0, 1);
828 ir
->operands
[2] = new(ir
) ir_dereference_variable(temp
);
830 this->progress
= true;
835 lower_instructions_visitor::double_lrp(ir_expression
*ir
)
838 ir_rvalue
*op0
= ir
->operands
[0], *op2
= ir
->operands
[2];
839 ir_constant
*one
= new(ir
) ir_constant(1.0, op2
->type
->vector_elements
);
841 switch (op2
->type
->vector_elements
) {
843 swizval
= SWIZZLE_XXXX
;
846 assert(op0
->type
->vector_elements
== op2
->type
->vector_elements
);
847 swizval
= SWIZZLE_XYZW
;
851 ir
->operation
= ir_triop_fma
;
852 ir
->operands
[0] = swizzle(op2
, swizval
, op0
->type
->vector_elements
);
853 ir
->operands
[2] = mul(sub(one
, op2
->clone(ir
, NULL
)), op0
);
855 this->progress
= true;
859 lower_instructions_visitor::dceil_to_dfrac(ir_expression
*ir
)
863 * temp = sub(x, frtemp);
864 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
866 ir_instruction
&i
= *base_ir
;
867 ir_constant
*zero
= new(ir
) ir_constant(0.0, ir
->operands
[0]->type
->vector_elements
);
868 ir_constant
*one
= new(ir
) ir_constant(1.0, ir
->operands
[0]->type
->vector_elements
);
869 ir_variable
*frtemp
= new(ir
) ir_variable(ir
->operands
[0]->type
, "frtemp",
872 i
.insert_before(frtemp
);
873 i
.insert_before(assign(frtemp
, fract(ir
->operands
[0])));
875 ir
->operation
= ir_binop_add
;
876 ir
->operands
[0] = sub(ir
->operands
[0]->clone(ir
, NULL
), frtemp
);
877 ir
->operands
[1] = csel(nequal(frtemp
, zero
), one
, zero
->clone(ir
, NULL
));
879 this->progress
= true;
883 lower_instructions_visitor::dfloor_to_dfrac(ir_expression
*ir
)
887 * result = sub(x, frtemp);
889 ir
->operation
= ir_binop_sub
;
890 ir
->operands
[1] = fract(ir
->operands
[0]->clone(ir
, NULL
));
892 this->progress
= true;
895 lower_instructions_visitor::dround_even_to_dfrac(ir_expression
*ir
)
900 * frtemp = frac(temp);
901 * t2 = sub(temp, frtemp);
902 * if (frac(x) == 0.5)
903 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
908 ir_instruction
&i
= *base_ir
;
909 ir_variable
*frtemp
= new(ir
) ir_variable(ir
->operands
[0]->type
, "frtemp",
911 ir_variable
*temp
= new(ir
) ir_variable(ir
->operands
[0]->type
, "temp",
913 ir_variable
*t2
= new(ir
) ir_variable(ir
->operands
[0]->type
, "t2",
915 ir_constant
*p5
= new(ir
) ir_constant(0.5, ir
->operands
[0]->type
->vector_elements
);
916 ir_constant
*one
= new(ir
) ir_constant(1.0, ir
->operands
[0]->type
->vector_elements
);
917 ir_constant
*zero
= new(ir
) ir_constant(0.0, ir
->operands
[0]->type
->vector_elements
);
919 i
.insert_before(temp
);
920 i
.insert_before(assign(temp
, add(ir
->operands
[0], p5
)));
922 i
.insert_before(frtemp
);
923 i
.insert_before(assign(frtemp
, fract(temp
)));
926 i
.insert_before(assign(t2
, sub(temp
, frtemp
)));
928 ir
->operation
= ir_triop_csel
;
929 ir
->operands
[0] = equal(fract(ir
->operands
[0]->clone(ir
, NULL
)),
930 p5
->clone(ir
, NULL
));
931 ir
->operands
[1] = csel(equal(fract(mul(t2
, p5
->clone(ir
, NULL
))),
935 ir
->operands
[2] = new(ir
) ir_dereference_variable(t2
);
937 this->progress
= true;
941 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression
*ir
)
945 * temp = sub(x, frtemp);
946 * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
948 ir_rvalue
*arg
= ir
->operands
[0];
949 ir_instruction
&i
= *base_ir
;
951 ir_constant
*zero
= new(ir
) ir_constant(0.0, arg
->type
->vector_elements
);
952 ir_constant
*one
= new(ir
) ir_constant(1.0, arg
->type
->vector_elements
);
953 ir_variable
*frtemp
= new(ir
) ir_variable(arg
->type
, "frtemp",
955 ir_variable
*temp
= new(ir
) ir_variable(ir
->operands
[0]->type
, "temp",
958 i
.insert_before(frtemp
);
959 i
.insert_before(assign(frtemp
, fract(arg
)));
960 i
.insert_before(temp
);
961 i
.insert_before(assign(temp
, sub(arg
->clone(ir
, NULL
), frtemp
)));
963 ir
->operation
= ir_triop_csel
;
964 ir
->operands
[0] = gequal(arg
->clone(ir
, NULL
), zero
);
965 ir
->operands
[1] = new (ir
) ir_dereference_variable(temp
);
966 ir
->operands
[2] = add(temp
,
967 csel(equal(frtemp
, zero
->clone(ir
, NULL
)),
968 zero
->clone(ir
, NULL
),
971 this->progress
= true;
975 lower_instructions_visitor::dsign_to_csel(ir_expression
*ir
)
978 * temp = x > 0.0 ? 1.0 : 0.0;
979 * result = x < 0.0 ? -1.0 : temp;
981 ir_rvalue
*arg
= ir
->operands
[0];
982 ir_constant
*zero
= new(ir
) ir_constant(0.0, arg
->type
->vector_elements
);
983 ir_constant
*one
= new(ir
) ir_constant(1.0, arg
->type
->vector_elements
);
984 ir_constant
*neg_one
= new(ir
) ir_constant(-1.0, arg
->type
->vector_elements
);
986 ir
->operation
= ir_triop_csel
;
987 ir
->operands
[0] = less(arg
->clone(ir
, NULL
),
988 zero
->clone(ir
, NULL
));
989 ir
->operands
[1] = neg_one
;
990 ir
->operands
[2] = csel(greater(arg
, zero
),
992 zero
->clone(ir
, NULL
));
994 this->progress
= true;
998 lower_instructions_visitor::visit_leave(ir_expression
*ir
)
1000 switch (ir
->operation
) {
1002 if (ir
->operands
[0]->type
->is_double())
1003 double_dot_to_fma(ir
);
1006 if (ir
->operands
[0]->type
->is_double())
1010 if (lowering(SUB_TO_ADD_NEG
))
1015 if (ir
->operands
[1]->type
->is_integer() && lowering(INT_DIV_TO_MUL_RCP
))
1016 int_div_to_mul_rcp(ir
);
1017 else if ((ir
->operands
[1]->type
->is_float() ||
1018 ir
->operands
[1]->type
->is_double()) && lowering(DIV_TO_MUL_RCP
))
1023 if (lowering(EXP_TO_EXP2
))
1028 if (lowering(LOG_TO_LOG2
))
1033 if (lowering(MOD_TO_FLOOR
) && (ir
->type
->is_float() || ir
->type
->is_double()))
1038 if (lowering(POW_TO_EXP2
))
1042 case ir_quadop_bitfield_insert
:
1043 if (lowering(BITFIELD_INSERT_TO_BFM_BFI
))
1044 bitfield_insert_to_bfm_bfi(ir
);
1047 case ir_binop_ldexp
:
1048 if (lowering(LDEXP_TO_ARITH
) && ir
->type
->is_float())
1050 if (lowering(DFREXP_DLDEXP_TO_ARITH
) && ir
->type
->is_double())
1051 dldexp_to_arith(ir
);
1054 case ir_unop_frexp_exp
:
1055 if (lowering(DFREXP_DLDEXP_TO_ARITH
) && ir
->operands
[0]->type
->is_double())
1056 dfrexp_exp_to_arith(ir
);
1059 case ir_unop_frexp_sig
:
1060 if (lowering(DFREXP_DLDEXP_TO_ARITH
) && ir
->operands
[0]->type
->is_double())
1061 dfrexp_sig_to_arith(ir
);
1064 case ir_binop_carry
:
1065 if (lowering(CARRY_TO_ARITH
))
1069 case ir_binop_borrow
:
1070 if (lowering(BORROW_TO_ARITH
))
1071 borrow_to_arith(ir
);
1074 case ir_unop_saturate
:
1075 if (lowering(SAT_TO_CLAMP
))
1080 if (lowering(DOPS_TO_DFRAC
) && ir
->type
->is_double())
1081 dtrunc_to_dfrac(ir
);
1085 if (lowering(DOPS_TO_DFRAC
) && ir
->type
->is_double())
1090 if (lowering(DOPS_TO_DFRAC
) && ir
->type
->is_double())
1091 dfloor_to_dfrac(ir
);
1094 case ir_unop_round_even
:
1095 if (lowering(DOPS_TO_DFRAC
) && ir
->type
->is_double())
1096 dround_even_to_dfrac(ir
);
1100 if (lowering(DOPS_TO_DFRAC
) && ir
->type
->is_double())
1104 return visit_continue
;
1107 return visit_continue
;