glsl/lower_instructions: fix regression in dldexp_to_arith
[mesa.git] / src / glsl / lower_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree. This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
31 *
32 * Currently supported transformations:
33 * - SUB_TO_ADD_NEG
34 * - DIV_TO_MUL_RCP
35 * - INT_DIV_TO_MUL_RCP
36 * - EXP_TO_EXP2
37 * - POW_TO_EXP2
38 * - LOG_TO_LOG2
39 * - MOD_TO_FLOOR
40 * - LDEXP_TO_ARITH
41 * - DFREXP_DLDEXP_TO_ARITH
42 * - CARRY_TO_ARITH
43 * - BORROW_TO_ARITH
44 * - SAT_TO_CLAMP
45 * - DOPS_TO_DFRAC
46 *
47 * SUB_TO_ADD_NEG:
48 * ---------------
49 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
50 *
51 * This simplifies expression reassociation, and for many backends
52 * there is no subtract operation separate from adding the negation.
53 * For backends with native subtract operations, they will probably
54 * want to recognize add(op0, neg(op1)) or the other way around to
55 * produce a subtract anyway.
56 *
57 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
58 * --------------------------------------
59 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
60 *
61 * Many GPUs don't have a divide instruction (945 and 965 included),
62 * but they do have an RCP instruction to compute an approximate
63 * reciprocal. By breaking the operation down, constant reciprocals
64 * can get constant folded.
65 *
66 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
67 * handles the integer case, converting to and from floating point so that
68 * RCP is possible.
69 *
70 * EXP_TO_EXP2 and LOG_TO_LOG2:
71 * ----------------------------
72 * Many GPUs don't have a base e log or exponent instruction, but they
73 * do have base 2 versions, so this pass converts exp and log to exp2
74 * and log2 operations.
75 *
76 * POW_TO_EXP2:
77 * -----------
78 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
79 * x**y to 2**(y * log2(x)).
80 *
81 * MOD_TO_FLOOR:
82 * -------------
83 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
84 *
85 * Many GPUs don't have a MOD instruction (945 and 965 included), and
86 * if we have to break it down like this anyway, it gives an
87 * opportunity to do things like constant fold the (1.0 / op1) easily.
88 *
89 * Note: before we used to implement this as op1 * fract(op0 / op1) but this
90 * implementation had significant precision errors.
91 *
92 * LDEXP_TO_ARITH:
93 * -------------
94 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
95 *
96 * DFREXP_DLDEXP_TO_ARITH:
97 * ---------------
98 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
99 * arithmetic and bit ops for double arguments.
100 *
101 * CARRY_TO_ARITH:
102 * ---------------
103 * Converts ir_carry into (x + y) < x.
104 *
105 * BORROW_TO_ARITH:
106 * ----------------
107 * Converts ir_borrow into (x < y).
108 *
109 * SAT_TO_CLAMP:
110 * -------------
111 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
112 *
113 * DOPS_TO_DFRAC:
114 * --------------
115 * Converts double trunc, ceil, floor, round to fract
116 */
117
118 #include "c99_math.h"
119 #include "program/prog_instruction.h" /* for swizzle */
120 #include "glsl_types.h"
121 #include "ir.h"
122 #include "ir_builder.h"
123 #include "ir_optimization.h"
124
125 using namespace ir_builder;
126
127 namespace {
128
129 class lower_instructions_visitor : public ir_hierarchical_visitor {
130 public:
131 lower_instructions_visitor(unsigned lower)
132 : progress(false), lower(lower) { }
133
134 ir_visitor_status visit_leave(ir_expression *);
135
136 bool progress;
137
138 private:
139 unsigned lower; /** Bitfield of which operations to lower */
140
141 void sub_to_add_neg(ir_expression *);
142 void div_to_mul_rcp(ir_expression *);
143 void int_div_to_mul_rcp(ir_expression *);
144 void mod_to_floor(ir_expression *);
145 void exp_to_exp2(ir_expression *);
146 void pow_to_exp2(ir_expression *);
147 void log_to_log2(ir_expression *);
148 void ldexp_to_arith(ir_expression *);
149 void dldexp_to_arith(ir_expression *);
150 void dfrexp_sig_to_arith(ir_expression *);
151 void dfrexp_exp_to_arith(ir_expression *);
152 void carry_to_arith(ir_expression *);
153 void borrow_to_arith(ir_expression *);
154 void sat_to_clamp(ir_expression *);
155 void double_dot_to_fma(ir_expression *);
156 void double_lrp(ir_expression *);
157 void dceil_to_dfrac(ir_expression *);
158 void dfloor_to_dfrac(ir_expression *);
159 void dround_even_to_dfrac(ir_expression *);
160 void dtrunc_to_dfrac(ir_expression *);
161 void dsign_to_csel(ir_expression *);
162 };
163
164 } /* anonymous namespace */
165
/**
 * Determine if a particular type of lowering should occur
 *
 * Evaluates to a non-zero value when any bit of \c x is set in the visitor's
 * \c lower bitfield.  It reads \c this->lower, so it can only be used inside
 * member functions of a class that has such a member.
 *
 * The argument is parenthesized so that a compound argument such as
 * \c A | B is masked as a unit instead of being split up by operator
 * precedence.
 */
#define lowering(x) (this->lower & (x))
170
171 bool
172 lower_instructions(exec_list *instructions, unsigned what_to_lower)
173 {
174 lower_instructions_visitor v(what_to_lower);
175
176 visit_list_elements(&v, instructions);
177 return v.progress;
178 }
179
180 void
181 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
182 {
183 ir->operation = ir_binop_add;
184 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
185 ir->operands[1], NULL);
186 this->progress = true;
187 }
188
189 void
190 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
191 {
192 assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
193
194 /* New expression for the 1.0 / op1 */
195 ir_rvalue *expr;
196 expr = new(ir) ir_expression(ir_unop_rcp,
197 ir->operands[1]->type,
198 ir->operands[1]);
199
200 /* op0 / op1 -> op0 * (1.0 / op1) */
201 ir->operation = ir_binop_mul;
202 ir->operands[1] = expr;
203
204 this->progress = true;
205 }
206
207 void
208 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
209 {
210 assert(ir->operands[1]->type->is_integer());
211
212 /* Be careful with integer division -- we need to do it as a
213 * float and re-truncate, since rcp(n > 1) of an integer would
214 * just be 0.
215 */
216 ir_rvalue *op0, *op1;
217 const struct glsl_type *vec_type;
218
219 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
220 ir->operands[1]->type->vector_elements,
221 ir->operands[1]->type->matrix_columns);
222
223 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
224 op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
225 else
226 op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
227
228 op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
229
230 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
231 ir->operands[0]->type->vector_elements,
232 ir->operands[0]->type->matrix_columns);
233
234 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
235 op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
236 else
237 op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
238
239 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
240 ir->type->vector_elements,
241 ir->type->matrix_columns);
242
243 op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
244
245 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
246 ir->operation = ir_unop_f2i;
247 ir->operands[0] = op0;
248 } else {
249 ir->operation = ir_unop_i2u;
250 ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
251 }
252 ir->operands[1] = NULL;
253
254 this->progress = true;
255 }
256
257 void
258 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
259 {
260 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
261
262 ir->operation = ir_unop_exp2;
263 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
264 ir->operands[0], log2_e);
265 this->progress = true;
266 }
267
268 void
269 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
270 {
271 ir_expression *const log2_x =
272 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
273 ir->operands[0]);
274
275 ir->operation = ir_unop_exp2;
276 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
277 ir->operands[1], log2_x);
278 ir->operands[1] = NULL;
279 this->progress = true;
280 }
281
282 void
283 lower_instructions_visitor::log_to_log2(ir_expression *ir)
284 {
285 ir->operation = ir_binop_mul;
286 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
287 ir->operands[0], NULL);
288 ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
289 this->progress = true;
290 }
291
292 void
293 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
294 {
295 ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
296 ir_var_temporary);
297 ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
298 ir_var_temporary);
299 this->base_ir->insert_before(x);
300 this->base_ir->insert_before(y);
301
302 ir_assignment *const assign_x =
303 new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
304 ir->operands[0], NULL);
305 ir_assignment *const assign_y =
306 new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
307 ir->operands[1], NULL);
308
309 this->base_ir->insert_before(assign_x);
310 this->base_ir->insert_before(assign_y);
311
312 ir_expression *const div_expr =
313 new(ir) ir_expression(ir_binop_div, x->type,
314 new(ir) ir_dereference_variable(x),
315 new(ir) ir_dereference_variable(y));
316
317 /* Don't generate new IR that would need to be lowered in an additional
318 * pass.
319 */
320 if (lowering(DIV_TO_MUL_RCP) && (ir->type->is_float() || ir->type->is_double()))
321 div_to_mul_rcp(div_expr);
322
323 ir_expression *const floor_expr =
324 new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
325
326 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
327 dfloor_to_dfrac(floor_expr);
328
329 ir_expression *const mul_expr =
330 new(ir) ir_expression(ir_binop_mul,
331 new(ir) ir_dereference_variable(y),
332 floor_expr);
333
334 ir->operation = ir_binop_sub;
335 ir->operands[0] = new(ir) ir_dereference_variable(x);
336 ir->operands[1] = mul_expr;
337 this->progress = true;
338 }
339
340 void
341 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
342 {
343 /* Translates
344 * ir_binop_ldexp x exp
345 * into
346 *
347 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
348 * resulting_biased_exp = extracted_biased_exp + exp;
349 *
350 * if (resulting_biased_exp < 1) {
351 * return copysign(0.0, x);
352 * }
353 *
354 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
355 * lshift(i2u(resulting_biased_exp), exp_shift));
356 *
357 * which we can't actually implement as such, since the GLSL IR doesn't
358 * have vectorized if-statements. We actually implement it without branches
359 * using conditional-select:
360 *
361 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
362 * resulting_biased_exp = extracted_biased_exp + exp;
363 *
364 * is_not_zero_or_underflow = gequal(resulting_biased_exp, 1);
365 * x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
366 * resulting_biased_exp = csel(is_not_zero_or_underflow,
367 * resulting_biased_exp, 0);
368 *
369 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
370 * lshift(i2u(resulting_biased_exp), exp_shift));
371 */
372
373 const unsigned vec_elem = ir->type->vector_elements;
374
375 /* Types */
376 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
377 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
378
379 /* Constants */
380 ir_constant *zeroi = ir_constant::zero(ir, ivec);
381
382 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
383
384 ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
385 ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
386
387 /* Temporary variables */
388 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
389 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
390
391 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
392 ir_var_temporary);
393
394 ir_variable *extracted_biased_exp =
395 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
396 ir_variable *resulting_biased_exp =
397 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
398
399 ir_variable *is_not_zero_or_underflow =
400 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
401
402 ir_instruction &i = *base_ir;
403
404 /* Copy <x> and <exp> arguments. */
405 i.insert_before(x);
406 i.insert_before(assign(x, ir->operands[0]));
407 i.insert_before(exp);
408 i.insert_before(assign(exp, ir->operands[1]));
409
410 /* Extract the biased exponent from <x>. */
411 i.insert_before(extracted_biased_exp);
412 i.insert_before(assign(extracted_biased_exp,
413 rshift(bitcast_f2i(abs(x)), exp_shift)));
414
415 i.insert_before(resulting_biased_exp);
416 i.insert_before(assign(resulting_biased_exp,
417 add(extracted_biased_exp, exp)));
418
419 /* Test if result is ±0.0, subnormal, or underflow by checking if the
420 * resulting biased exponent would be less than 0x1. If so, the result is
421 * 0.0 with the sign of x. (Actually, invert the conditions so that
422 * immediate values are the second arguments, which is better for i965)
423 */
424 i.insert_before(zero_sign_x);
425 i.insert_before(assign(zero_sign_x,
426 bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
427
428 i.insert_before(is_not_zero_or_underflow);
429 i.insert_before(assign(is_not_zero_or_underflow,
430 gequal(resulting_biased_exp,
431 new(ir) ir_constant(0x1, vec_elem))));
432 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
433 x, zero_sign_x)));
434 i.insert_before(assign(resulting_biased_exp,
435 csel(is_not_zero_or_underflow,
436 resulting_biased_exp, zeroi)));
437
438 /* We could test for overflows by checking if the resulting biased exponent
439 * would be greater than 0xFE. Turns out we don't need to because the GLSL
440 * spec says:
441 *
442 * "If this product is too large to be represented in the
443 * floating-point type, the result is undefined."
444 */
445
446 ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
447 ir->operation = ir_unop_bitcast_i2f;
448 ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
449 exp_shift_clone, exp_width);
450 ir->operands[1] = NULL;
451
452 this->progress = true;
453 }
454
455 void
456 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
457 {
458 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
459 * from the significand.
460 */
461
462 const unsigned vec_elem = ir->type->vector_elements;
463
464 /* Types */
465 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
466 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
467
468 /* Constants */
469 ir_constant *zeroi = ir_constant::zero(ir, ivec);
470
471 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
472
473 ir_constant *exp_shift = new(ir) ir_constant(20u);
474 ir_constant *exp_width = new(ir) ir_constant(11u);
475 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
476
477 /* Temporary variables */
478 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
479 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
480
481 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
482 ir_var_temporary);
483
484 ir_variable *extracted_biased_exp =
485 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
486 ir_variable *resulting_biased_exp =
487 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
488
489 ir_variable *is_not_zero_or_underflow =
490 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
491
492 ir_instruction &i = *base_ir;
493
494 /* Copy <x> and <exp> arguments. */
495 i.insert_before(x);
496 i.insert_before(assign(x, ir->operands[0]));
497 i.insert_before(exp);
498 i.insert_before(assign(exp, ir->operands[1]));
499
500 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
501 if (lowering(DFREXP_DLDEXP_TO_ARITH))
502 dfrexp_exp_to_arith(frexp_exp);
503
504 /* Extract the biased exponent from <x>. */
505 i.insert_before(extracted_biased_exp);
506 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
507
508 i.insert_before(resulting_biased_exp);
509 i.insert_before(assign(resulting_biased_exp,
510 add(extracted_biased_exp, exp)));
511
512 /* Test if result is ±0.0, subnormal, or underflow by checking if the
513 * resulting biased exponent would be less than 0x1. If so, the result is
514 * 0.0 with the sign of x. (Actually, invert the conditions so that
515 * immediate values are the second arguments, which is better for i965)
516 * TODO: Implement in a vector fashion.
517 */
518 i.insert_before(zero_sign_x);
519 for (unsigned elem = 0; elem < vec_elem; elem++) {
520 ir_variable *unpacked =
521 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
522 i.insert_before(unpacked);
523 i.insert_before(
524 assign(unpacked,
525 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
526 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
527 WRITEMASK_Y));
528 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
529 i.insert_before(assign(zero_sign_x,
530 expr(ir_unop_pack_double_2x32, unpacked),
531 1 << elem));
532 }
533 i.insert_before(is_not_zero_or_underflow);
534 i.insert_before(assign(is_not_zero_or_underflow,
535 gequal(resulting_biased_exp,
536 new(ir) ir_constant(0x1, vec_elem))));
537 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
538 x, zero_sign_x)));
539 i.insert_before(assign(resulting_biased_exp,
540 csel(is_not_zero_or_underflow,
541 resulting_biased_exp, zeroi)));
542
543 /* We could test for overflows by checking if the resulting biased exponent
544 * would be greater than 0xFE. Turns out we don't need to because the GLSL
545 * spec says:
546 *
547 * "If this product is too large to be represented in the
548 * floating-point type, the result is undefined."
549 */
550
551 ir_rvalue *results[4] = {NULL};
552 for (unsigned elem = 0; elem < vec_elem; elem++) {
553 ir_variable *unpacked =
554 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
555 i.insert_before(unpacked);
556 i.insert_before(
557 assign(unpacked,
558 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
559
560 ir_expression *bfi = bitfield_insert(
561 swizzle_y(unpacked),
562 i2u(swizzle(resulting_biased_exp, elem, 1)),
563 exp_shift->clone(ir, NULL),
564 exp_width->clone(ir, NULL));
565
566 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
567
568 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
569 }
570
571 ir->operation = ir_quadop_vector;
572 ir->operands[0] = results[0];
573 ir->operands[1] = results[1];
574 ir->operands[2] = results[2];
575 ir->operands[3] = results[3];
576
577 /* Don't generate new IR that would need to be lowered in an additional
578 * pass.
579 */
580
581 this->progress = true;
582 }
583
584 void
585 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
586 {
587 const unsigned vec_elem = ir->type->vector_elements;
588 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
589
590 /* Double-precision floating-point values are stored as
591 * 1 sign bit;
592 * 11 exponent bits;
593 * 52 mantissa bits.
594 *
595 * We're just extracting the significand here, so we only need to modify
596 * the upper 32-bit uint. Unfortunately we must extract each double
597 * independently as there is no vector version of unpackDouble.
598 */
599
600 ir_instruction &i = *base_ir;
601
602 ir_variable *is_not_zero =
603 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
604 ir_rvalue *results[4] = {NULL};
605
606 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
607 i.insert_before(is_not_zero);
608 i.insert_before(
609 assign(is_not_zero,
610 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
611
612 /* TODO: Remake this as more vector-friendly when int64 support is
613 * available.
614 */
615 for (unsigned elem = 0; elem < vec_elem; elem++) {
616 ir_constant *zero = new(ir) ir_constant(0u, 1);
617 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
618
619 /* Exponent of double floating-point values in the range [0.5, 1.0). */
620 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
621
622 ir_variable *bits =
623 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
624 ir_variable *unpacked =
625 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
626
627 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
628
629 i.insert_before(bits);
630 i.insert_before(unpacked);
631 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
632
633 /* Manipulate the high uint to remove the exponent and replace it with
634 * either the default exponent or zero.
635 */
636 i.insert_before(assign(bits, swizzle_y(unpacked)));
637 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
638 i.insert_before(assign(bits, bit_or(bits,
639 csel(swizzle(is_not_zero, elem, 1),
640 exponent_value,
641 zero))));
642 i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
643 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
644 }
645
646 /* Put the dvec back together */
647 ir->operation = ir_quadop_vector;
648 ir->operands[0] = results[0];
649 ir->operands[1] = results[1];
650 ir->operands[2] = results[2];
651 ir->operands[3] = results[3];
652
653 this->progress = true;
654 }
655
656 void
657 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
658 {
659 const unsigned vec_elem = ir->type->vector_elements;
660 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
661 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
662
663 /* Double-precision floating-point values are stored as
664 * 1 sign bit;
665 * 11 exponent bits;
666 * 52 mantissa bits.
667 *
668 * We're just extracting the exponent here, so we only care about the upper
669 * 32-bit uint.
670 */
671
672 ir_instruction &i = *base_ir;
673
674 ir_variable *is_not_zero =
675 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
676 ir_variable *high_words =
677 new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
678 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
679 ir_constant *izero = new(ir) ir_constant(0, vec_elem);
680
681 ir_rvalue *absval = abs(ir->operands[0]);
682
683 i.insert_before(is_not_zero);
684 i.insert_before(high_words);
685 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
686
687 /* Extract all of the upper uints. */
688 for (unsigned elem = 0; elem < vec_elem; elem++) {
689 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
690
691 i.insert_before(assign(high_words,
692 swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
693 1 << elem));
694
695 }
696 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
697 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
698
699 /* For non-zero inputs, shift the exponent down and apply bias. */
700 ir->operation = ir_triop_csel;
701 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
702 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
703 ir->operands[2] = izero;
704
705 this->progress = true;
706 }
707
708 void
709 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
710 {
711 /* Translates
712 * ir_binop_carry x y
713 * into
714 * sum = ir_binop_add x y
715 * bcarry = ir_binop_less sum x
716 * carry = ir_unop_b2i bcarry
717 */
718
719 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
720 ir->operation = ir_unop_i2u;
721 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
722 ir->operands[1] = NULL;
723
724 this->progress = true;
725 }
726
727 void
728 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
729 {
730 /* Translates
731 * ir_binop_borrow x y
732 * into
733 * bcarry = ir_binop_less x y
734 * carry = ir_unop_b2i bcarry
735 */
736
737 ir->operation = ir_unop_i2u;
738 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
739 ir->operands[1] = NULL;
740
741 this->progress = true;
742 }
743
744 void
745 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
746 {
747 /* Translates
748 * ir_unop_saturate x
749 * into
750 * ir_binop_min (ir_binop_max(x, 0.0), 1.0)
751 */
752
753 ir->operation = ir_binop_min;
754 ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
755 ir->operands[0],
756 new(ir) ir_constant(0.0f));
757 ir->operands[1] = new(ir) ir_constant(1.0f);
758
759 this->progress = true;
760 }
761
762 void
763 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
764 {
765 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
766 ir_var_temporary);
767 this->base_ir->insert_before(temp);
768
769 int nc = ir->operands[0]->type->components();
770 for (int i = nc - 1; i >= 1; i--) {
771 ir_assignment *assig;
772 if (i == (nc - 1)) {
773 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
774 swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
775 } else {
776 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
777 swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
778 temp));
779 }
780 this->base_ir->insert_before(assig);
781 }
782
783 ir->operation = ir_triop_fma;
784 ir->operands[0] = swizzle(ir->operands[0], 0, 1);
785 ir->operands[1] = swizzle(ir->operands[1], 0, 1);
786 ir->operands[2] = new(ir) ir_dereference_variable(temp);
787
788 this->progress = true;
789
790 }
791
792 void
793 lower_instructions_visitor::double_lrp(ir_expression *ir)
794 {
795 int swizval;
796 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
797 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
798
799 switch (op2->type->vector_elements) {
800 case 1:
801 swizval = SWIZZLE_XXXX;
802 break;
803 default:
804 assert(op0->type->vector_elements == op2->type->vector_elements);
805 swizval = SWIZZLE_XYZW;
806 break;
807 }
808
809 ir->operation = ir_triop_fma;
810 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
811 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
812
813 this->progress = true;
814 }
815
816 void
817 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
818 {
819 /*
820 * frtemp = frac(x);
821 * temp = sub(x, frtemp);
822 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
823 */
824 ir_instruction &i = *base_ir;
825 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
826 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
827 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
828 ir_var_temporary);
829
830 i.insert_before(frtemp);
831 i.insert_before(assign(frtemp, fract(ir->operands[0])));
832
833 ir->operation = ir_binop_add;
834 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
835 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
836
837 this->progress = true;
838 }
839
840 void
841 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
842 {
843 /*
844 * frtemp = frac(x);
845 * result = sub(x, frtemp);
846 */
847 ir->operation = ir_binop_sub;
848 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
849
850 this->progress = true;
851 }
852 void
853 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
854 {
855 /*
856 * insane but works
857 * temp = x + 0.5;
858 * frtemp = frac(temp);
859 * t2 = sub(temp, frtemp);
860 * if (frac(x) == 0.5)
861 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
862 * else
863 * result = t2;
864
865 */
866 ir_instruction &i = *base_ir;
867 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
868 ir_var_temporary);
869 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
870 ir_var_temporary);
871 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
872 ir_var_temporary);
873 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
874 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
875 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
876
877 i.insert_before(temp);
878 i.insert_before(assign(temp, add(ir->operands[0], p5)));
879
880 i.insert_before(frtemp);
881 i.insert_before(assign(frtemp, fract(temp)));
882
883 i.insert_before(t2);
884 i.insert_before(assign(t2, sub(temp, frtemp)));
885
886 ir->operation = ir_triop_csel;
887 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
888 p5->clone(ir, NULL));
889 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
890 zero),
891 t2,
892 sub(t2, one));
893 ir->operands[2] = new(ir) ir_dereference_variable(t2);
894
895 this->progress = true;
896 }
897
898 void
899 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
900 {
901 /*
902 * frtemp = frac(x);
903 * temp = sub(x, frtemp);
904 * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
905 */
906 ir_rvalue *arg = ir->operands[0];
907 ir_instruction &i = *base_ir;
908
909 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
910 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
911 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
912 ir_var_temporary);
913 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
914 ir_var_temporary);
915
916 i.insert_before(frtemp);
917 i.insert_before(assign(frtemp, fract(arg)));
918 i.insert_before(temp);
919 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
920
921 ir->operation = ir_triop_csel;
922 ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
923 ir->operands[1] = new (ir) ir_dereference_variable(temp);
924 ir->operands[2] = add(temp,
925 csel(equal(frtemp, zero->clone(ir, NULL)),
926 zero->clone(ir, NULL),
927 one));
928
929 this->progress = true;
930 }
931
932 void
933 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
934 {
935 /*
936 * temp = x > 0.0 ? 1.0 : 0.0;
937 * result = x < 0.0 ? -1.0 : temp;
938 */
939 ir_rvalue *arg = ir->operands[0];
940 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
941 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
942 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
943
944 ir->operation = ir_triop_csel;
945 ir->operands[0] = less(arg->clone(ir, NULL),
946 zero->clone(ir, NULL));
947 ir->operands[1] = neg_one;
948 ir->operands[2] = csel(greater(arg, zero),
949 one,
950 zero->clone(ir, NULL));
951
952 this->progress = true;
953 }
954
955 ir_visitor_status
956 lower_instructions_visitor::visit_leave(ir_expression *ir)
957 {
958 switch (ir->operation) {
959 case ir_binop_dot:
960 if (ir->operands[0]->type->is_double())
961 double_dot_to_fma(ir);
962 break;
963 case ir_triop_lrp:
964 if (ir->operands[0]->type->is_double())
965 double_lrp(ir);
966 break;
967 case ir_binop_sub:
968 if (lowering(SUB_TO_ADD_NEG))
969 sub_to_add_neg(ir);
970 break;
971
972 case ir_binop_div:
973 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
974 int_div_to_mul_rcp(ir);
975 else if ((ir->operands[1]->type->is_float() ||
976 ir->operands[1]->type->is_double()) && lowering(DIV_TO_MUL_RCP))
977 div_to_mul_rcp(ir);
978 break;
979
980 case ir_unop_exp:
981 if (lowering(EXP_TO_EXP2))
982 exp_to_exp2(ir);
983 break;
984
985 case ir_unop_log:
986 if (lowering(LOG_TO_LOG2))
987 log_to_log2(ir);
988 break;
989
990 case ir_binop_mod:
991 if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
992 mod_to_floor(ir);
993 break;
994
995 case ir_binop_pow:
996 if (lowering(POW_TO_EXP2))
997 pow_to_exp2(ir);
998 break;
999
1000 case ir_binop_ldexp:
1001 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1002 ldexp_to_arith(ir);
1003 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1004 dldexp_to_arith(ir);
1005 break;
1006
1007 case ir_unop_frexp_exp:
1008 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1009 dfrexp_exp_to_arith(ir);
1010 break;
1011
1012 case ir_unop_frexp_sig:
1013 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1014 dfrexp_sig_to_arith(ir);
1015 break;
1016
1017 case ir_binop_carry:
1018 if (lowering(CARRY_TO_ARITH))
1019 carry_to_arith(ir);
1020 break;
1021
1022 case ir_binop_borrow:
1023 if (lowering(BORROW_TO_ARITH))
1024 borrow_to_arith(ir);
1025 break;
1026
1027 case ir_unop_saturate:
1028 if (lowering(SAT_TO_CLAMP))
1029 sat_to_clamp(ir);
1030 break;
1031
1032 case ir_unop_trunc:
1033 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1034 dtrunc_to_dfrac(ir);
1035 break;
1036
1037 case ir_unop_ceil:
1038 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1039 dceil_to_dfrac(ir);
1040 break;
1041
1042 case ir_unop_floor:
1043 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1044 dfloor_to_dfrac(ir);
1045 break;
1046
1047 case ir_unop_round_even:
1048 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1049 dround_even_to_dfrac(ir);
1050 break;
1051
1052 case ir_unop_sign:
1053 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1054 dsign_to_csel(ir);
1055 break;
1056 default:
1057 return visit_continue;
1058 }
1059
1060 return visit_continue;
1061 }