glsl: Add lowering pass for ir_triop_bitfield_extract
[mesa.git] / src / compiler / glsl / lower_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree. This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
31 *
32 * Currently supported transformations:
33 * - SUB_TO_ADD_NEG
34 * - DIV_TO_MUL_RCP
35 * - INT_DIV_TO_MUL_RCP
36 * - EXP_TO_EXP2
37 * - POW_TO_EXP2
38 * - LOG_TO_LOG2
39 * - MOD_TO_FLOOR
40 * - LDEXP_TO_ARITH
41 * - DFREXP_DLDEXP_TO_ARITH
42 * - CARRY_TO_ARITH
43 * - BORROW_TO_ARITH
44 * - SAT_TO_CLAMP
45 * - DOPS_TO_DFRAC
46 *
47 * SUB_TO_ADD_NEG:
48 * ---------------
49 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
50 *
51 * This simplifies expression reassociation, and for many backends
52 * there is no subtract operation separate from adding the negation.
53 * For backends with native subtract operations, they will probably
54 * want to recognize add(op0, neg(op1)) or the other way around to
55 * produce a subtract anyway.
56 *
57 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
58 * --------------------------------------
59 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
60 *
61 * Many GPUs don't have a divide instruction (945 and 965 included),
62 * but they do have an RCP instruction to compute an approximate
63 * reciprocal. By breaking the operation down, constant reciprocals
64 * can get constant folded.
65 *
66 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
67 * handles the integer case, converting to and from floating point so that
68 * RCP is possible.
69 *
70 * EXP_TO_EXP2 and LOG_TO_LOG2:
71 * ----------------------------
72 * Many GPUs don't have a base e log or exponent instruction, but they
73 * do have base 2 versions, so this pass converts exp and log to exp2
74 * and log2 operations.
75 *
76 * POW_TO_EXP2:
77 * -----------
78 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
79 * x**y to 2**(y * log2(x)).
80 *
81 * MOD_TO_FLOOR:
82 * -------------
83 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
84 *
85 * Many GPUs don't have a MOD instruction (945 and 965 included), and
86 * if we have to break it down like this anyway, it gives an
87 * opportunity to do things like constant fold the (1.0 / op1) easily.
88 *
89 * Note: before we used to implement this as op1 * fract(op0 / op1) but this
90 * implementation had significant precision errors.
91 *
92 * LDEXP_TO_ARITH:
93 * ---------------
94 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
95 *
96 * DFREXP_DLDEXP_TO_ARITH:
97 * -----------------------
98 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
99 * arithmetic and bit ops for double arguments.
100 *
101 * CARRY_TO_ARITH:
102 * ---------------
103 * Converts ir_carry into (x + y) < x.
104 *
105 * BORROW_TO_ARITH:
106 * ----------------
107 * Converts ir_borrow into (x < y).
108 *
109 * SAT_TO_CLAMP:
110 * -------------
111 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
112 *
113 * DOPS_TO_DFRAC:
114 * --------------
115 * Converts double trunc, ceil, floor, round to fract
116 */
117
118 #include "c99_math.h"
119 #include "program/prog_instruction.h" /* for swizzle */
120 #include "compiler/glsl_types.h"
121 #include "ir.h"
122 #include "ir_builder.h"
123 #include "ir_optimization.h"
124
125 using namespace ir_builder;
126
127 namespace {
128
129 class lower_instructions_visitor : public ir_hierarchical_visitor {
130 public:
131 lower_instructions_visitor(unsigned lower)
132 : progress(false), lower(lower) { }
133
134 ir_visitor_status visit_leave(ir_expression *);
135
136 bool progress;
137
138 private:
139 unsigned lower; /** Bitfield of which operations to lower */
140
141 void sub_to_add_neg(ir_expression *);
142 void div_to_mul_rcp(ir_expression *);
143 void int_div_to_mul_rcp(ir_expression *);
144 void mod_to_floor(ir_expression *);
145 void exp_to_exp2(ir_expression *);
146 void pow_to_exp2(ir_expression *);
147 void log_to_log2(ir_expression *);
148 void ldexp_to_arith(ir_expression *);
149 void dldexp_to_arith(ir_expression *);
150 void dfrexp_sig_to_arith(ir_expression *);
151 void dfrexp_exp_to_arith(ir_expression *);
152 void carry_to_arith(ir_expression *);
153 void borrow_to_arith(ir_expression *);
154 void sat_to_clamp(ir_expression *);
155 void double_dot_to_fma(ir_expression *);
156 void double_lrp(ir_expression *);
157 void dceil_to_dfrac(ir_expression *);
158 void dfloor_to_dfrac(ir_expression *);
159 void dround_even_to_dfrac(ir_expression *);
160 void dtrunc_to_dfrac(ir_expression *);
161 void dsign_to_csel(ir_expression *);
162 void bit_count_to_math(ir_expression *);
163 void extract_to_shifts(ir_expression *);
164 };
165
166 } /* anonymous namespace */
167
/**
 * Determine if a particular type of lowering should occur
 *
 * Evaluates to a non-zero value when at least one bit of \c x is also set in
 * \c this->lower, so it may only be used inside member functions of the
 * visitor.  The argument is parenthesised so that a compound expression such
 * as \c lowering(A | B) is masked as a whole instead of being torn apart by
 * operator precedence.
 */
#define lowering(x) (this->lower & (x))
172
173 bool
174 lower_instructions(exec_list *instructions, unsigned what_to_lower)
175 {
176 lower_instructions_visitor v(what_to_lower);
177
178 visit_list_elements(&v, instructions);
179 return v.progress;
180 }
181
182 void
183 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
184 {
185 ir->operation = ir_binop_add;
186 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
187 ir->operands[1], NULL);
188 this->progress = true;
189 }
190
191 void
192 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
193 {
194 assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
195
196 /* New expression for the 1.0 / op1 */
197 ir_rvalue *expr;
198 expr = new(ir) ir_expression(ir_unop_rcp,
199 ir->operands[1]->type,
200 ir->operands[1]);
201
202 /* op0 / op1 -> op0 * (1.0 / op1) */
203 ir->operation = ir_binop_mul;
204 ir->operands[1] = expr;
205
206 this->progress = true;
207 }
208
209 void
210 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
211 {
212 assert(ir->operands[1]->type->is_integer());
213
214 /* Be careful with integer division -- we need to do it as a
215 * float and re-truncate, since rcp(n > 1) of an integer would
216 * just be 0.
217 */
218 ir_rvalue *op0, *op1;
219 const struct glsl_type *vec_type;
220
221 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
222 ir->operands[1]->type->vector_elements,
223 ir->operands[1]->type->matrix_columns);
224
225 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
226 op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
227 else
228 op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
229
230 op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
231
232 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
233 ir->operands[0]->type->vector_elements,
234 ir->operands[0]->type->matrix_columns);
235
236 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
237 op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
238 else
239 op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
240
241 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
242 ir->type->vector_elements,
243 ir->type->matrix_columns);
244
245 op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
246
247 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
248 ir->operation = ir_unop_f2i;
249 ir->operands[0] = op0;
250 } else {
251 ir->operation = ir_unop_i2u;
252 ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
253 }
254 ir->operands[1] = NULL;
255
256 this->progress = true;
257 }
258
259 void
260 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
261 {
262 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
263
264 ir->operation = ir_unop_exp2;
265 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
266 ir->operands[0], log2_e);
267 this->progress = true;
268 }
269
270 void
271 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
272 {
273 ir_expression *const log2_x =
274 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
275 ir->operands[0]);
276
277 ir->operation = ir_unop_exp2;
278 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
279 ir->operands[1], log2_x);
280 ir->operands[1] = NULL;
281 this->progress = true;
282 }
283
284 void
285 lower_instructions_visitor::log_to_log2(ir_expression *ir)
286 {
287 ir->operation = ir_binop_mul;
288 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
289 ir->operands[0], NULL);
290 ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
291 this->progress = true;
292 }
293
294 void
295 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
296 {
297 ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
298 ir_var_temporary);
299 ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
300 ir_var_temporary);
301 this->base_ir->insert_before(x);
302 this->base_ir->insert_before(y);
303
304 ir_assignment *const assign_x =
305 new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
306 ir->operands[0], NULL);
307 ir_assignment *const assign_y =
308 new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
309 ir->operands[1], NULL);
310
311 this->base_ir->insert_before(assign_x);
312 this->base_ir->insert_before(assign_y);
313
314 ir_expression *const div_expr =
315 new(ir) ir_expression(ir_binop_div, x->type,
316 new(ir) ir_dereference_variable(x),
317 new(ir) ir_dereference_variable(y));
318
319 /* Don't generate new IR that would need to be lowered in an additional
320 * pass.
321 */
322 if (lowering(DIV_TO_MUL_RCP) && (ir->type->is_float() || ir->type->is_double()))
323 div_to_mul_rcp(div_expr);
324
325 ir_expression *const floor_expr =
326 new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
327
328 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
329 dfloor_to_dfrac(floor_expr);
330
331 ir_expression *const mul_expr =
332 new(ir) ir_expression(ir_binop_mul,
333 new(ir) ir_dereference_variable(y),
334 floor_expr);
335
336 ir->operation = ir_binop_sub;
337 ir->operands[0] = new(ir) ir_dereference_variable(x);
338 ir->operands[1] = mul_expr;
339 this->progress = true;
340 }
341
342 void
343 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
344 {
345 /* Translates
346 * ir_binop_ldexp x exp
347 * into
348 *
349 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
350 * resulting_biased_exp = extracted_biased_exp + exp;
351 *
352 * if (resulting_biased_exp < 1 || x == 0.0f) {
353 * return copysign(0.0, x);
354 * }
355 *
356 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
357 * lshift(i2u(resulting_biased_exp), exp_shift));
358 *
359 * which we can't actually implement as such, since the GLSL IR doesn't
360 * have vectorized if-statements. We actually implement it without branches
361 * using conditional-select:
362 *
363 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
364 * resulting_biased_exp = extracted_biased_exp + exp;
365 *
366 * is_not_zero_or_underflow = logic_and(nequal(x, 0.0f),
367 * gequal(resulting_biased_exp, 1);
368 * x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
369 * resulting_biased_exp = csel(is_not_zero_or_underflow,
370 * resulting_biased_exp, 0);
371 *
372 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
373 * lshift(i2u(resulting_biased_exp), exp_shift));
374 */
375
376 const unsigned vec_elem = ir->type->vector_elements;
377
378 /* Types */
379 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
380 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
381
382 /* Constants */
383 ir_constant *zeroi = ir_constant::zero(ir, ivec);
384
385 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
386
387 ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
388 ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
389
390 /* Temporary variables */
391 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
392 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
393
394 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
395 ir_var_temporary);
396
397 ir_variable *extracted_biased_exp =
398 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
399 ir_variable *resulting_biased_exp =
400 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
401
402 ir_variable *is_not_zero_or_underflow =
403 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
404
405 ir_instruction &i = *base_ir;
406
407 /* Copy <x> and <exp> arguments. */
408 i.insert_before(x);
409 i.insert_before(assign(x, ir->operands[0]));
410 i.insert_before(exp);
411 i.insert_before(assign(exp, ir->operands[1]));
412
413 /* Extract the biased exponent from <x>. */
414 i.insert_before(extracted_biased_exp);
415 i.insert_before(assign(extracted_biased_exp,
416 rshift(bitcast_f2i(abs(x)), exp_shift)));
417
418 i.insert_before(resulting_biased_exp);
419 i.insert_before(assign(resulting_biased_exp,
420 add(extracted_biased_exp, exp)));
421
422 /* Test if result is ±0.0, subnormal, or underflow by checking if the
423 * resulting biased exponent would be less than 0x1. If so, the result is
424 * 0.0 with the sign of x. (Actually, invert the conditions so that
425 * immediate values are the second arguments, which is better for i965)
426 */
427 i.insert_before(zero_sign_x);
428 i.insert_before(assign(zero_sign_x,
429 bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
430
431 i.insert_before(is_not_zero_or_underflow);
432 i.insert_before(assign(is_not_zero_or_underflow,
433 logic_and(nequal(x, new(ir) ir_constant(0.0f, vec_elem)),
434 gequal(resulting_biased_exp,
435 new(ir) ir_constant(0x1, vec_elem)))));
436 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
437 x, zero_sign_x)));
438 i.insert_before(assign(resulting_biased_exp,
439 csel(is_not_zero_or_underflow,
440 resulting_biased_exp, zeroi)));
441
442 /* We could test for overflows by checking if the resulting biased exponent
443 * would be greater than 0xFE. Turns out we don't need to because the GLSL
444 * spec says:
445 *
446 * "If this product is too large to be represented in the
447 * floating-point type, the result is undefined."
448 */
449
450 ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
451 ir->operation = ir_unop_bitcast_i2f;
452 ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
453 exp_shift_clone, exp_width);
454 ir->operands[1] = NULL;
455
456 this->progress = true;
457 }
458
459 void
460 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
461 {
462 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
463 * from the significand.
464 */
465
466 const unsigned vec_elem = ir->type->vector_elements;
467
468 /* Types */
469 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
470 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
471
472 /* Constants */
473 ir_constant *zeroi = ir_constant::zero(ir, ivec);
474
475 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
476
477 ir_constant *exp_shift = new(ir) ir_constant(20u);
478 ir_constant *exp_width = new(ir) ir_constant(11u);
479 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
480
481 /* Temporary variables */
482 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
483 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
484
485 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
486 ir_var_temporary);
487
488 ir_variable *extracted_biased_exp =
489 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
490 ir_variable *resulting_biased_exp =
491 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
492
493 ir_variable *is_not_zero_or_underflow =
494 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
495
496 ir_instruction &i = *base_ir;
497
498 /* Copy <x> and <exp> arguments. */
499 i.insert_before(x);
500 i.insert_before(assign(x, ir->operands[0]));
501 i.insert_before(exp);
502 i.insert_before(assign(exp, ir->operands[1]));
503
504 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
505 if (lowering(DFREXP_DLDEXP_TO_ARITH))
506 dfrexp_exp_to_arith(frexp_exp);
507
508 /* Extract the biased exponent from <x>. */
509 i.insert_before(extracted_biased_exp);
510 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
511
512 i.insert_before(resulting_biased_exp);
513 i.insert_before(assign(resulting_biased_exp,
514 add(extracted_biased_exp, exp)));
515
516 /* Test if result is ±0.0, subnormal, or underflow by checking if the
517 * resulting biased exponent would be less than 0x1. If so, the result is
518 * 0.0 with the sign of x. (Actually, invert the conditions so that
519 * immediate values are the second arguments, which is better for i965)
520 * TODO: Implement in a vector fashion.
521 */
522 i.insert_before(zero_sign_x);
523 for (unsigned elem = 0; elem < vec_elem; elem++) {
524 ir_variable *unpacked =
525 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
526 i.insert_before(unpacked);
527 i.insert_before(
528 assign(unpacked,
529 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
530 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
531 WRITEMASK_Y));
532 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
533 i.insert_before(assign(zero_sign_x,
534 expr(ir_unop_pack_double_2x32, unpacked),
535 1 << elem));
536 }
537 i.insert_before(is_not_zero_or_underflow);
538 i.insert_before(assign(is_not_zero_or_underflow,
539 gequal(resulting_biased_exp,
540 new(ir) ir_constant(0x1, vec_elem))));
541 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
542 x, zero_sign_x)));
543 i.insert_before(assign(resulting_biased_exp,
544 csel(is_not_zero_or_underflow,
545 resulting_biased_exp, zeroi)));
546
547 /* We could test for overflows by checking if the resulting biased exponent
548 * would be greater than 0xFE. Turns out we don't need to because the GLSL
549 * spec says:
550 *
551 * "If this product is too large to be represented in the
552 * floating-point type, the result is undefined."
553 */
554
555 ir_rvalue *results[4] = {NULL};
556 for (unsigned elem = 0; elem < vec_elem; elem++) {
557 ir_variable *unpacked =
558 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
559 i.insert_before(unpacked);
560 i.insert_before(
561 assign(unpacked,
562 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
563
564 ir_expression *bfi = bitfield_insert(
565 swizzle_y(unpacked),
566 i2u(swizzle(resulting_biased_exp, elem, 1)),
567 exp_shift->clone(ir, NULL),
568 exp_width->clone(ir, NULL));
569
570 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
571
572 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
573 }
574
575 ir->operation = ir_quadop_vector;
576 ir->operands[0] = results[0];
577 ir->operands[1] = results[1];
578 ir->operands[2] = results[2];
579 ir->operands[3] = results[3];
580
581 /* Don't generate new IR that would need to be lowered in an additional
582 * pass.
583 */
584
585 this->progress = true;
586 }
587
588 void
589 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
590 {
591 const unsigned vec_elem = ir->type->vector_elements;
592 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
593
594 /* Double-precision floating-point values are stored as
595 * 1 sign bit;
596 * 11 exponent bits;
597 * 52 mantissa bits.
598 *
599 * We're just extracting the significand here, so we only need to modify
600 * the upper 32-bit uint. Unfortunately we must extract each double
601 * independently as there is no vector version of unpackDouble.
602 */
603
604 ir_instruction &i = *base_ir;
605
606 ir_variable *is_not_zero =
607 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
608 ir_rvalue *results[4] = {NULL};
609
610 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
611 i.insert_before(is_not_zero);
612 i.insert_before(
613 assign(is_not_zero,
614 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
615
616 /* TODO: Remake this as more vector-friendly when int64 support is
617 * available.
618 */
619 for (unsigned elem = 0; elem < vec_elem; elem++) {
620 ir_constant *zero = new(ir) ir_constant(0u, 1);
621 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
622
623 /* Exponent of double floating-point values in the range [0.5, 1.0). */
624 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
625
626 ir_variable *bits =
627 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
628 ir_variable *unpacked =
629 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
630
631 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
632
633 i.insert_before(bits);
634 i.insert_before(unpacked);
635 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
636
637 /* Manipulate the high uint to remove the exponent and replace it with
638 * either the default exponent or zero.
639 */
640 i.insert_before(assign(bits, swizzle_y(unpacked)));
641 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
642 i.insert_before(assign(bits, bit_or(bits,
643 csel(swizzle(is_not_zero, elem, 1),
644 exponent_value,
645 zero))));
646 i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
647 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
648 }
649
650 /* Put the dvec back together */
651 ir->operation = ir_quadop_vector;
652 ir->operands[0] = results[0];
653 ir->operands[1] = results[1];
654 ir->operands[2] = results[2];
655 ir->operands[3] = results[3];
656
657 this->progress = true;
658 }
659
660 void
661 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
662 {
663 const unsigned vec_elem = ir->type->vector_elements;
664 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
665 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
666
667 /* Double-precision floating-point values are stored as
668 * 1 sign bit;
669 * 11 exponent bits;
670 * 52 mantissa bits.
671 *
672 * We're just extracting the exponent here, so we only care about the upper
673 * 32-bit uint.
674 */
675
676 ir_instruction &i = *base_ir;
677
678 ir_variable *is_not_zero =
679 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
680 ir_variable *high_words =
681 new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
682 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
683 ir_constant *izero = new(ir) ir_constant(0, vec_elem);
684
685 ir_rvalue *absval = abs(ir->operands[0]);
686
687 i.insert_before(is_not_zero);
688 i.insert_before(high_words);
689 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
690
691 /* Extract all of the upper uints. */
692 for (unsigned elem = 0; elem < vec_elem; elem++) {
693 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
694
695 i.insert_before(assign(high_words,
696 swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
697 1 << elem));
698
699 }
700 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
701 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
702
703 /* For non-zero inputs, shift the exponent down and apply bias. */
704 ir->operation = ir_triop_csel;
705 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
706 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
707 ir->operands[2] = izero;
708
709 this->progress = true;
710 }
711
712 void
713 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
714 {
715 /* Translates
716 * ir_binop_carry x y
717 * into
718 * sum = ir_binop_add x y
719 * bcarry = ir_binop_less sum x
720 * carry = ir_unop_b2i bcarry
721 */
722
723 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
724 ir->operation = ir_unop_i2u;
725 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
726 ir->operands[1] = NULL;
727
728 this->progress = true;
729 }
730
731 void
732 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
733 {
734 /* Translates
735 * ir_binop_borrow x y
736 * into
737 * bcarry = ir_binop_less x y
738 * carry = ir_unop_b2i bcarry
739 */
740
741 ir->operation = ir_unop_i2u;
742 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
743 ir->operands[1] = NULL;
744
745 this->progress = true;
746 }
747
748 void
749 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
750 {
751 /* Translates
752 * ir_unop_saturate x
753 * into
754 * ir_binop_min (ir_binop_max(x, 0.0), 1.0)
755 */
756
757 ir->operation = ir_binop_min;
758 ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
759 ir->operands[0],
760 new(ir) ir_constant(0.0f));
761 ir->operands[1] = new(ir) ir_constant(1.0f);
762
763 this->progress = true;
764 }
765
766 void
767 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
768 {
769 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
770 ir_var_temporary);
771 this->base_ir->insert_before(temp);
772
773 int nc = ir->operands[0]->type->components();
774 for (int i = nc - 1; i >= 1; i--) {
775 ir_assignment *assig;
776 if (i == (nc - 1)) {
777 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
778 swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
779 } else {
780 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
781 swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
782 temp));
783 }
784 this->base_ir->insert_before(assig);
785 }
786
787 ir->operation = ir_triop_fma;
788 ir->operands[0] = swizzle(ir->operands[0], 0, 1);
789 ir->operands[1] = swizzle(ir->operands[1], 0, 1);
790 ir->operands[2] = new(ir) ir_dereference_variable(temp);
791
792 this->progress = true;
793
794 }
795
796 void
797 lower_instructions_visitor::double_lrp(ir_expression *ir)
798 {
799 int swizval;
800 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
801 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
802
803 switch (op2->type->vector_elements) {
804 case 1:
805 swizval = SWIZZLE_XXXX;
806 break;
807 default:
808 assert(op0->type->vector_elements == op2->type->vector_elements);
809 swizval = SWIZZLE_XYZW;
810 break;
811 }
812
813 ir->operation = ir_triop_fma;
814 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
815 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
816
817 this->progress = true;
818 }
819
820 void
821 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
822 {
823 /*
824 * frtemp = frac(x);
825 * temp = sub(x, frtemp);
826 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
827 */
828 ir_instruction &i = *base_ir;
829 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
830 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
831 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
832 ir_var_temporary);
833
834 i.insert_before(frtemp);
835 i.insert_before(assign(frtemp, fract(ir->operands[0])));
836
837 ir->operation = ir_binop_add;
838 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
839 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
840
841 this->progress = true;
842 }
843
844 void
845 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
846 {
847 /*
848 * frtemp = frac(x);
849 * result = sub(x, frtemp);
850 */
851 ir->operation = ir_binop_sub;
852 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
853
854 this->progress = true;
855 }
856 void
857 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
858 {
859 /*
860 * insane but works
861 * temp = x + 0.5;
862 * frtemp = frac(temp);
863 * t2 = sub(temp, frtemp);
864 * if (frac(x) == 0.5)
865 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
866 * else
867 * result = t2;
868
869 */
870 ir_instruction &i = *base_ir;
871 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
872 ir_var_temporary);
873 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
874 ir_var_temporary);
875 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
876 ir_var_temporary);
877 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
878 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
879 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
880
881 i.insert_before(temp);
882 i.insert_before(assign(temp, add(ir->operands[0], p5)));
883
884 i.insert_before(frtemp);
885 i.insert_before(assign(frtemp, fract(temp)));
886
887 i.insert_before(t2);
888 i.insert_before(assign(t2, sub(temp, frtemp)));
889
890 ir->operation = ir_triop_csel;
891 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
892 p5->clone(ir, NULL));
893 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
894 zero),
895 t2,
896 sub(t2, one));
897 ir->operands[2] = new(ir) ir_dereference_variable(t2);
898
899 this->progress = true;
900 }
901
902 void
903 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
904 {
905 /*
906 * frtemp = frac(x);
907 * temp = sub(x, frtemp);
908 * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
909 */
910 ir_rvalue *arg = ir->operands[0];
911 ir_instruction &i = *base_ir;
912
913 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
914 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
915 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
916 ir_var_temporary);
917 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
918 ir_var_temporary);
919
920 i.insert_before(frtemp);
921 i.insert_before(assign(frtemp, fract(arg)));
922 i.insert_before(temp);
923 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
924
925 ir->operation = ir_triop_csel;
926 ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
927 ir->operands[1] = new (ir) ir_dereference_variable(temp);
928 ir->operands[2] = add(temp,
929 csel(equal(frtemp, zero->clone(ir, NULL)),
930 zero->clone(ir, NULL),
931 one));
932
933 this->progress = true;
934 }
935
936 void
937 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
938 {
939 /*
940 * temp = x > 0.0 ? 1.0 : 0.0;
941 * result = x < 0.0 ? -1.0 : temp;
942 */
943 ir_rvalue *arg = ir->operands[0];
944 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
945 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
946 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
947
948 ir->operation = ir_triop_csel;
949 ir->operands[0] = less(arg->clone(ir, NULL),
950 zero->clone(ir, NULL));
951 ir->operands[1] = neg_one;
952 ir->operands[2] = csel(greater(arg, zero),
953 one,
954 zero->clone(ir, NULL));
955
956 this->progress = true;
957 }
958
959 void
960 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
961 {
962 /* For more details, see:
963 *
964 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel
965 */
966 const unsigned elements = ir->operands[0]->type->vector_elements;
967 ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
968 ir_var_temporary);
969 ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
970 ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
971 ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
972 ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
973 ir_constant *c1 = new(ir) ir_constant(1u);
974 ir_constant *c2 = new(ir) ir_constant(2u);
975 ir_constant *c4 = new(ir) ir_constant(4u);
976 ir_constant *c24 = new(ir) ir_constant(24u);
977
978 base_ir->insert_before(temp);
979
980 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
981 base_ir->insert_before(assign(temp, ir->operands[0]));
982 } else {
983 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
984 base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
985 }
986
987 /* temp = temp - ((temp >> 1) & 0x55555555u); */
988 base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
989 c55555555))));
990
991 /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
992 base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
993 bit_and(rshift(temp, c2),
994 c33333333->clone(ir, NULL)))));
995
996 /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
997 ir->operation = ir_unop_u2i;
998 ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
999 c01010101),
1000 c24);
1001
1002 this->progress = true;
1003 }
1004
1005 void
1006 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
1007 {
1008 ir_variable *bits =
1009 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1010
1011 base_ir->insert_before(bits);
1012 base_ir->insert_before(assign(bits, ir->operands[2]));
1013
1014 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1015 ir_constant *c1 =
1016 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1017 ir_constant *c32 =
1018 new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1019 ir_constant *cFFFFFFFF =
1020 new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1021
1022 /* At least some hardware treats (x << y) as (x << (y%32)). This means
1023 * we'd get a mask of 0 when bits is 32. Special case it.
1024 *
1025 * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
1026 */
1027 ir_expression *mask = csel(equal(bits, c32),
1028 cFFFFFFFF,
1029 sub(lshift(c1, bits), c1->clone(ir, NULL)));
1030
1031 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1032 *
1033 * If bits is zero, the result will be zero.
1034 *
1035 * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
1036 * select as in the signed integer case.
1037 *
1038 * (value >> offset) & mask;
1039 */
1040 ir->operation = ir_binop_bit_and;
1041 ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
1042 ir->operands[1] = mask;
1043 ir->operands[2] = NULL;
1044 } else {
1045 ir_constant *c0 =
1046 new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
1047 ir_constant *c32 =
1048 new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1049 ir_variable *temp =
1050 new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
1051
1052 /* temp = 32 - bits; */
1053 base_ir->insert_before(temp);
1054 base_ir->insert_before(assign(temp, sub(c32, bits)));
1055
1056 /* expr = value << (temp - offset)) >> temp; */
1057 ir_expression *expr =
1058 rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
1059
1060 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1061 *
1062 * If bits is zero, the result will be zero.
1063 *
1064 * Due to the (x << (y%32)) behavior mentioned before, the (value <<
1065 * (32-0)) doesn't "erase" all of the data as we would like, so finish
1066 * up with:
1067 *
1068 * (bits == 0) ? 0 : e;
1069 */
1070 ir->operation = ir_triop_csel;
1071 ir->operands[0] = equal(c0, bits);
1072 ir->operands[1] = c0->clone(ir, NULL);
1073 ir->operands[2] = expr;
1074 }
1075
1076 this->progress = true;
1077 }
1078
1079 ir_visitor_status
1080 lower_instructions_visitor::visit_leave(ir_expression *ir)
1081 {
1082 switch (ir->operation) {
1083 case ir_binop_dot:
1084 if (ir->operands[0]->type->is_double())
1085 double_dot_to_fma(ir);
1086 break;
1087 case ir_triop_lrp:
1088 if (ir->operands[0]->type->is_double())
1089 double_lrp(ir);
1090 break;
1091 case ir_binop_sub:
1092 if (lowering(SUB_TO_ADD_NEG))
1093 sub_to_add_neg(ir);
1094 break;
1095
1096 case ir_binop_div:
1097 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
1098 int_div_to_mul_rcp(ir);
1099 else if ((ir->operands[1]->type->is_float() ||
1100 ir->operands[1]->type->is_double()) && lowering(DIV_TO_MUL_RCP))
1101 div_to_mul_rcp(ir);
1102 break;
1103
1104 case ir_unop_exp:
1105 if (lowering(EXP_TO_EXP2))
1106 exp_to_exp2(ir);
1107 break;
1108
1109 case ir_unop_log:
1110 if (lowering(LOG_TO_LOG2))
1111 log_to_log2(ir);
1112 break;
1113
1114 case ir_binop_mod:
1115 if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
1116 mod_to_floor(ir);
1117 break;
1118
1119 case ir_binop_pow:
1120 if (lowering(POW_TO_EXP2))
1121 pow_to_exp2(ir);
1122 break;
1123
1124 case ir_binop_ldexp:
1125 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1126 ldexp_to_arith(ir);
1127 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1128 dldexp_to_arith(ir);
1129 break;
1130
1131 case ir_unop_frexp_exp:
1132 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1133 dfrexp_exp_to_arith(ir);
1134 break;
1135
1136 case ir_unop_frexp_sig:
1137 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1138 dfrexp_sig_to_arith(ir);
1139 break;
1140
1141 case ir_binop_carry:
1142 if (lowering(CARRY_TO_ARITH))
1143 carry_to_arith(ir);
1144 break;
1145
1146 case ir_binop_borrow:
1147 if (lowering(BORROW_TO_ARITH))
1148 borrow_to_arith(ir);
1149 break;
1150
1151 case ir_unop_saturate:
1152 if (lowering(SAT_TO_CLAMP))
1153 sat_to_clamp(ir);
1154 break;
1155
1156 case ir_unop_trunc:
1157 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1158 dtrunc_to_dfrac(ir);
1159 break;
1160
1161 case ir_unop_ceil:
1162 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1163 dceil_to_dfrac(ir);
1164 break;
1165
1166 case ir_unop_floor:
1167 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1168 dfloor_to_dfrac(ir);
1169 break;
1170
1171 case ir_unop_round_even:
1172 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1173 dround_even_to_dfrac(ir);
1174 break;
1175
1176 case ir_unop_sign:
1177 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1178 dsign_to_csel(ir);
1179 break;
1180
1181 case ir_unop_bit_count:
1182 if (lowering(BIT_COUNT_TO_MATH))
1183 bit_count_to_math(ir);
1184 break;
1185
1186 case ir_triop_bitfield_extract:
1187 if (lowering(EXTRACT_TO_SHIFTS))
1188 extract_to_shifts(ir);
1189 break;
1190
1191 default:
1192 return visit_continue;
1193 }
1194
1195 return visit_continue;
1196 }