glsl: add lowering passes for carry/borrow
[mesa.git] / src / glsl / lower_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree. This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
31 *
32 * Currently supported transformations:
33 * - SUB_TO_ADD_NEG
34 * - DIV_TO_MUL_RCP
35 * - INT_DIV_TO_MUL_RCP
36 * - EXP_TO_EXP2
37 * - POW_TO_EXP2
38 * - LOG_TO_LOG2
39 * - MOD_TO_FRACT
40 * - LDEXP_TO_ARITH
41 * - BITFIELD_INSERT_TO_BFM_BFI
42 * - CARRY_TO_ARITH
43 * - BORROW_TO_ARITH
44 *
45 * SUB_TO_ADD_NEG:
46 * ---------------
47 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
48 *
49 * This simplifies expression reassociation, and for many backends
50 * there is no subtract operation separate from adding the negation.
51 * For backends with native subtract operations, they will probably
52 * want to recognize add(op0, neg(op1)) or the other way around to
53 * produce a subtract anyway.
54 *
55 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
56 * --------------------------------------
57 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
58 *
59 * Many GPUs don't have a divide instruction (945 and 965 included),
60 * but they do have an RCP instruction to compute an approximate
61 * reciprocal. By breaking the operation down, constant reciprocals
62 * can get constant folded.
63 *
64 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
65 * handles the integer case, converting to and from floating point so that
66 * RCP is possible.
67 *
68 * EXP_TO_EXP2 and LOG_TO_LOG2:
69 * ----------------------------
70 * Many GPUs don't have a base e log or exponent instruction, but they
71 * do have base 2 versions, so this pass converts exp and log to exp2
72 * and log2 operations.
73 *
74 * POW_TO_EXP2:
75 * -----------
76 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
77 * x**y to 2**(y * log2(x)).
78 *
79 * MOD_TO_FRACT:
80 * -------------
81 * Breaks an ir_binop_mod expression down to (op1 * fract(op0 / op1))
82 *
83 * Many GPUs don't have a MOD instruction (945 and 965 included), and
84 * if we have to break it down like this anyway, it gives an
85 * opportunity to do things like constant fold the (1.0 / op1) easily.
86 *
87 * LDEXP_TO_ARITH:
88 * -------------
89 * Converts ir_binop_ldexp to arithmetic and bit operations.
90 *
91 * BITFIELD_INSERT_TO_BFM_BFI:
92 * ---------------------------
93 * Breaks ir_quadop_bitfield_insert into ir_binop_bfm (bitfield mask) and
94 * ir_triop_bfi (bitfield insert).
95 *
 96  * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader5
97 * with a pair of instructions.
98 *
99 * CARRY_TO_ARITH:
100 * ---------------
 101  * Converts ir_binop_carry into (x + y) < x.
102 *
103 * BORROW_TO_ARITH:
104 * ----------------
 105  * Converts ir_binop_borrow into (x < y).
106 *
107 */
108
109 #include "main/core.h" /* for M_LOG2E */
110 #include "glsl_types.h"
111 #include "ir.h"
112 #include "ir_builder.h"
113 #include "ir_optimization.h"
114
115 using namespace ir_builder;
116
namespace {

/**
 * Hierarchical IR visitor that applies the lowering transformations
 * selected by the \c lower bitfield.
 *
 * visit_leave() inspects each expression's opcode and, when the matching
 * *_TO_* flag is set, rewrites the expression in place via one of the
 * private helpers below.
 */
class lower_instructions_visitor : public ir_hierarchical_visitor {
public:
   lower_instructions_visitor(unsigned lower)
      : progress(false), lower(lower) { }

   ir_visitor_status visit_leave(ir_expression *);

   /* Set to true whenever any expression was rewritten. */
   bool progress;

private:
   unsigned lower; /** Bitfield of which operations to lower */

   /* One helper per lowering; each rewrites the given expression in place. */
   void sub_to_add_neg(ir_expression *);
   void div_to_mul_rcp(ir_expression *);
   void int_div_to_mul_rcp(ir_expression *);
   void mod_to_fract(ir_expression *);
   void exp_to_exp2(ir_expression *);
   void pow_to_exp2(ir_expression *);
   void log_to_log2(ir_expression *);
   void bitfield_insert_to_bfm_bfi(ir_expression *);
   void ldexp_to_arith(ir_expression *);
   void carry_to_arith(ir_expression *);
   void borrow_to_arith(ir_expression *);
};

} /* anonymous namespace */
145
146 /**
147 * Determine if a particular type of lowering should occur
148 */
149 #define lowering(x) (this->lower & x)
150
151 bool
152 lower_instructions(exec_list *instructions, unsigned what_to_lower)
153 {
154 lower_instructions_visitor v(what_to_lower);
155
156 visit_list_elements(&v, instructions);
157 return v.progress;
158 }
159
160 void
161 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
162 {
163 ir->operation = ir_binop_add;
164 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
165 ir->operands[1], NULL);
166 this->progress = true;
167 }
168
169 void
170 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
171 {
172 assert(ir->operands[1]->type->is_float());
173
174 /* New expression for the 1.0 / op1 */
175 ir_rvalue *expr;
176 expr = new(ir) ir_expression(ir_unop_rcp,
177 ir->operands[1]->type,
178 ir->operands[1]);
179
180 /* op0 / op1 -> op0 * (1.0 / op1) */
181 ir->operation = ir_binop_mul;
182 ir->operands[1] = expr;
183
184 this->progress = true;
185 }
186
void
lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
{
   assert(ir->operands[1]->type->is_integer());

   /* Be careful with integer division -- we need to do it as a
    * float and re-truncate, since rcp(n > 1) of an integer would
    * just be 0.
    */
   ir_rvalue *op0, *op1;
   const struct glsl_type *vec_type;

   /* Float vector type with the same shape as the divisor. */
   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->operands[1]->type->vector_elements,
                                      ir->operands[1]->type->matrix_columns);

   /* Convert the divisor to float using the conversion matching its
    * signedness, then take its reciprocal.
    */
   if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
      op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
   else
      op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);

   op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);

   /* Float vector type with the same shape as the dividend. */
   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->operands[0]->type->vector_elements,
                                      ir->operands[0]->type->matrix_columns);

   if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
      op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
   else
      op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);

   /* op0 * (1.0 / op1), computed entirely in float. */
   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->type->vector_elements,
                                      ir->type->matrix_columns);

   op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);

   /* Truncate the float quotient back to the expression's integer type.
    * NOTE(review): the unsigned path converts via f2i followed by i2u,
    * which goes through signed int -- quotients above INT_MAX presumably
    * cannot survive that round-trip; confirm this range is acceptable for
    * the GLSL versions this pass targets.
    */
   if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
      ir->operation = ir_unop_f2i;
      ir->operands[0] = op0;
   } else {
      ir->operation = ir_unop_i2u;
      ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
   }
   ir->operands[1] = NULL;

   this->progress = true;
}
236
237 void
238 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
239 {
240 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
241
242 ir->operation = ir_unop_exp2;
243 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
244 ir->operands[0], log2_e);
245 this->progress = true;
246 }
247
248 void
249 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
250 {
251 ir_expression *const log2_x =
252 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
253 ir->operands[0]);
254
255 ir->operation = ir_unop_exp2;
256 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
257 ir->operands[1], log2_x);
258 ir->operands[1] = NULL;
259 this->progress = true;
260 }
261
262 void
263 lower_instructions_visitor::log_to_log2(ir_expression *ir)
264 {
265 ir->operation = ir_binop_mul;
266 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
267 ir->operands[0], NULL);
268 ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
269 this->progress = true;
270 }
271
272 void
273 lower_instructions_visitor::mod_to_fract(ir_expression *ir)
274 {
275 ir_variable *temp = new(ir) ir_variable(ir->operands[1]->type, "mod_b",
276 ir_var_temporary);
277 this->base_ir->insert_before(temp);
278
279 ir_assignment *const assign =
280 new(ir) ir_assignment(new(ir) ir_dereference_variable(temp),
281 ir->operands[1], NULL);
282
283 this->base_ir->insert_before(assign);
284
285 ir_expression *const div_expr =
286 new(ir) ir_expression(ir_binop_div, ir->operands[0]->type,
287 ir->operands[0],
288 new(ir) ir_dereference_variable(temp));
289
290 /* Don't generate new IR that would need to be lowered in an additional
291 * pass.
292 */
293 if (lowering(DIV_TO_MUL_RCP))
294 div_to_mul_rcp(div_expr);
295
296 ir_rvalue *expr = new(ir) ir_expression(ir_unop_fract,
297 ir->operands[0]->type,
298 div_expr,
299 NULL);
300
301 ir->operation = ir_binop_mul;
302 ir->operands[0] = new(ir) ir_dereference_variable(temp);
303 ir->operands[1] = expr;
304 this->progress = true;
305 }
306
307 void
308 lower_instructions_visitor::bitfield_insert_to_bfm_bfi(ir_expression *ir)
309 {
310 /* Translates
311 * ir_quadop_bitfield_insert base insert offset bits
312 * into
313 * ir_triop_bfi (ir_binop_bfm bits offset) insert base
314 */
315
316 ir_rvalue *base_expr = ir->operands[0];
317
318 ir->operation = ir_triop_bfi;
319 ir->operands[0] = new(ir) ir_expression(ir_binop_bfm,
320 ir->type->get_base_type(),
321 ir->operands[3],
322 ir->operands[2]);
323 /* ir->operands[1] is still the value to insert. */
324 ir->operands[2] = base_expr;
325 ir->operands[3] = NULL;
326
327 this->progress = true;
328 }
329
void
lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
{
   /* Translates
    *    ir_binop_ldexp x exp
    * into
    *
    *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    *    resulting_biased_exp = extracted_biased_exp + exp;
    *
    *    if (resulting_biased_exp < 1) {
    *       return copysign(0.0, x);
    *    }
    *
    *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
    *                       lshift(i2u(resulting_biased_exp), exp_shift));
    *
    * which we can't actually implement as such, since the GLSL IR doesn't
    * have vectorized if-statements. We actually implement it without branches
    * using conditional-select:
    *
    *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    *    resulting_biased_exp = extracted_biased_exp + exp;
    *
    *    is_not_zero_or_underflow = gequal(resulting_biased_exp, 1);
    *    x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
    *    resulting_biased_exp = csel(is_not_zero_or_underflow,
    *                                resulting_biased_exp, 0);
    *
    *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
    *                       lshift(i2u(resulting_biased_exp), exp_shift));
    */

   const unsigned vec_elem = ir->type->vector_elements;

   /* Types */
   const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);

   /* Constants */
   ir_constant *zeroi = ir_constant::zero(ir, ivec);

   /* 0x80000000 selects only the sign bit of a 32-bit float. */
   ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);

   /* Bit position (23) and width (8) of the exponent field -- this lowering
    * assumes IEEE-754 binary32 floats.
    */
   ir_constant *exp_shift = new(ir) ir_constant(23);
   ir_constant *exp_width = new(ir) ir_constant(8);

   /* Temporary variables */
   ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
   ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);

   ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
                                                  ir_var_temporary);

   ir_variable *extracted_biased_exp =
      new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
   ir_variable *resulting_biased_exp =
      new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);

   ir_variable *is_not_zero_or_underflow =
      new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);

   /* All the setup code is inserted before the instruction that contains
    * this ldexp expression; only the final bitcast remains in place.
    */
   ir_instruction &i = *base_ir;

   /* Copy <x> and <exp> arguments. */
   i.insert_before(x);
   i.insert_before(assign(x, ir->operands[0]));
   i.insert_before(exp);
   i.insert_before(assign(exp, ir->operands[1]));

   /* Extract the biased exponent from <x>. */
   i.insert_before(extracted_biased_exp);
   i.insert_before(assign(extracted_biased_exp,
                          rshift(bitcast_f2i(abs(x)), exp_shift)));

   i.insert_before(resulting_biased_exp);
   i.insert_before(assign(resulting_biased_exp,
                          add(extracted_biased_exp, exp)));

   /* Test if result is ±0.0, subnormal, or underflow by checking if the
    * resulting biased exponent would be less than 0x1. If so, the result is
    * 0.0 with the sign of x. (Actually, invert the conditions so that
    * immediate values are the second arguments, which is better for i965)
    */
   i.insert_before(zero_sign_x);
   i.insert_before(assign(zero_sign_x,
                          bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));

   i.insert_before(is_not_zero_or_underflow);
   i.insert_before(assign(is_not_zero_or_underflow,
                          gequal(resulting_biased_exp,
                                 new(ir) ir_constant(0x1, vec_elem))));
   i.insert_before(assign(x, csel(is_not_zero_or_underflow,
                                  x, zero_sign_x)));
   i.insert_before(assign(resulting_biased_exp,
                          csel(is_not_zero_or_underflow,
                               resulting_biased_exp, zeroi)));

   /* We could test for overflows by checking if the resulting biased exponent
    * would be greater than 0xFE. Turns out we don't need to because the GLSL
    * spec says:
    *
    *    "If this product is too large to be represented in the
    *     floating-point type, the result is undefined."
    */

   /* Clone exp_shift: the original node was already consumed by the rshift
    * above, and IR trees must not share nodes.
    */
   ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
   ir->operation = ir_unop_bitcast_i2f;
   ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
                                     exp_shift_clone, exp_width);
   ir->operands[1] = NULL;

   /* Don't generate new IR that would need to be lowered in an additional
    * pass.
    */
   if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
      bitfield_insert_to_bfm_bfi(ir->operands[0]->as_expression());

   this->progress = true;
}
450
451 void
452 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
453 {
454 /* Translates
455 * ir_binop_carry x y
456 * into
457 * sum = ir_binop_add x y
458 * bcarry = ir_binop_less sum x
459 * carry = ir_unop_b2i bcarry
460 */
461
462 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
463 ir->operation = ir_unop_i2u;
464 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
465 ir->operands[1] = NULL;
466
467 this->progress = true;
468 }
469
470 void
471 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
472 {
473 /* Translates
474 * ir_binop_borrow x y
475 * into
476 * bcarry = ir_binop_less x y
477 * carry = ir_unop_b2i bcarry
478 */
479
480 ir->operation = ir_unop_i2u;
481 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
482 ir->operands[1] = NULL;
483
484 this->progress = true;
485 }
486
487 ir_visitor_status
488 lower_instructions_visitor::visit_leave(ir_expression *ir)
489 {
490 switch (ir->operation) {
491 case ir_binop_sub:
492 if (lowering(SUB_TO_ADD_NEG))
493 sub_to_add_neg(ir);
494 break;
495
496 case ir_binop_div:
497 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
498 int_div_to_mul_rcp(ir);
499 else if (ir->operands[1]->type->is_float() && lowering(DIV_TO_MUL_RCP))
500 div_to_mul_rcp(ir);
501 break;
502
503 case ir_unop_exp:
504 if (lowering(EXP_TO_EXP2))
505 exp_to_exp2(ir);
506 break;
507
508 case ir_unop_log:
509 if (lowering(LOG_TO_LOG2))
510 log_to_log2(ir);
511 break;
512
513 case ir_binop_mod:
514 if (lowering(MOD_TO_FRACT) && ir->type->is_float())
515 mod_to_fract(ir);
516 break;
517
518 case ir_binop_pow:
519 if (lowering(POW_TO_EXP2))
520 pow_to_exp2(ir);
521 break;
522
523 case ir_quadop_bitfield_insert:
524 if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
525 bitfield_insert_to_bfm_bfi(ir);
526 break;
527
528 case ir_binop_ldexp:
529 if (lowering(LDEXP_TO_ARITH))
530 ldexp_to_arith(ir);
531 break;
532
533 case ir_binop_carry:
534 if (lowering(CARRY_TO_ARITH))
535 carry_to_arith(ir);
536 break;
537
538 case ir_binop_borrow:
539 if (lowering(BORROW_TO_ARITH))
540 borrow_to_arith(ir);
541 break;
542
543 default:
544 return visit_continue;
545 }
546
547 return visit_continue;
548 }