glsl: Add lowering pass for ir_unop_find_msb
mesa.git: src/compiler/glsl/lower_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree. This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
31 *
32 * Currently supported transformations:
33 * - SUB_TO_ADD_NEG
34 * - DIV_TO_MUL_RCP
35 * - INT_DIV_TO_MUL_RCP
36 * - EXP_TO_EXP2
37 * - POW_TO_EXP2
38 * - LOG_TO_LOG2
39 * - MOD_TO_FLOOR
40 * - LDEXP_TO_ARITH
41  * - DFREXP_DLDEXP_TO_ARITH
42 * - CARRY_TO_ARITH
43 * - BORROW_TO_ARITH
44 * - SAT_TO_CLAMP
45 * - DOPS_TO_DFRAC
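 * - BIT_COUNT_TO_MATH
 * - EXTRACT_TO_SHIFTS
 * - INSERT_TO_SHIFTS
 * - REVERSE_TO_SHIFTS
 * - FIND_LSB_TO_FLOAT_CAST
 * - FIND_MSB_TO_FLOAT_CAST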
46 *
47 * SUB_TO_ADD_NEG:
48 * ---------------
49 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
50 *
51 * This simplifies expression reassociation, and for many backends
52 * there is no subtract operation separate from adding the negation.
53  * Backends that do have a native subtract operation will probably
54  * want to recognize add(op0, neg(op1)) (or the operands swapped) and
55  * turn it back into a subtract anyway.
56 *
57 * DIV_TO_MUL_RCP and INT_DIV_TO_MUL_RCP:
58 * --------------------------------------
59 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
60 *
61 * Many GPUs don't have a divide instruction (945 and 965 included),
62 * but they do have an RCP instruction to compute an approximate
63 * reciprocal. By breaking the operation down, constant reciprocals
64 * can get constant folded.
65 *
66 * DIV_TO_MUL_RCP only lowers floating point division; INT_DIV_TO_MUL_RCP
67 * handles the integer case, converting to and from floating point so that
68 * RCP is possible.
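 * For example, an int division a / b roughly becomes
 * f2i(i2f(a) * rcp(i2f(b))), with an extra i2u at the end for unsigned
 * operands (see int_div_to_mul_rcp below).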
69 *
70 * EXP_TO_EXP2 and LOG_TO_LOG2:
71 * ----------------------------
72 * Many GPUs don't have a base e log or exponent instruction, but they
73 * do have base 2 versions, so this pass converts exp and log to exp2
74 * and log2 operations.
75 *
76 * POW_TO_EXP2:
77 * -----------
78 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
79 * x**y to 2**(y * log2(x)).
80 *
81 * MOD_TO_FLOOR:
82 * -------------
83 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
84 *
85 * Many GPUs don't have a MOD instruction (945 and 965 included), and
86 * if we have to break it down like this anyway, it gives an
87 * opportunity to do things like constant fold the (1.0 / op1) easily.
88 *
89  * Note: this used to be implemented as op1 * fract(op0 / op1), but that
90  * implementation had significant precision errors.
91 *
92 * LDEXP_TO_ARITH:
93  * ---------------
94 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
95 *
96 * DFREXP_DLDEXP_TO_ARITH:
97  * -----------------------
98 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
99 * arithmetic and bit ops for double arguments.
100 *
101 * CARRY_TO_ARITH:
102 * ---------------
103  * Converts ir_binop_carry into (x + y) < x.
104 *
105 * BORROW_TO_ARITH:
106 * ----------------
107  * Converts ir_binop_borrow into (x < y).
108 *
109 * SAT_TO_CLAMP:
110 * -------------
111 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
112 *
113 * DOPS_TO_DFRAC:
114 * --------------
115 * Converts double trunc, ceil, floor, round to fract
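 *
 * BIT_COUNT_TO_MATH, EXTRACT_TO_SHIFTS, INSERT_TO_SHIFTS, REVERSE_TO_SHIFTS:
 * ---------------------------------------------------------------------------
 * Convert ir_unop_bit_count, ir_triop_bitfield_extract,
 * ir_quadop_bitfield_insert, and ir_unop_bitfield_reverse to sequences of
 * shifts, masks, and integer arithmetic.
 *
 * FIND_LSB_TO_FLOAT_CAST and FIND_MSB_TO_FLOAT_CAST:
 * --------------------------------------------------
 * Convert ir_unop_find_lsb and ir_unop_find_msb to an int-to-float cast and
 * extraction of the exponent bits.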
116 */
117
118 #include "c99_math.h"
119 #include "program/prog_instruction.h" /* for swizzle */
120 #include "compiler/glsl_types.h"
121 #include "ir.h"
122 #include "ir_builder.h"
123 #include "ir_optimization.h"
124
125 using namespace ir_builder;
126
127 namespace {
128
129 class lower_instructions_visitor : public ir_hierarchical_visitor {
130 public:
131 lower_instructions_visitor(unsigned lower)
132 : progress(false), lower(lower) { }
133
134 ir_visitor_status visit_leave(ir_expression *);
135
136 bool progress;
137
138 private:
139 unsigned lower; /** Bitfield of which operations to lower */
140
141 void sub_to_add_neg(ir_expression *);
142 void div_to_mul_rcp(ir_expression *);
143 void int_div_to_mul_rcp(ir_expression *);
144 void mod_to_floor(ir_expression *);
145 void exp_to_exp2(ir_expression *);
146 void pow_to_exp2(ir_expression *);
147 void log_to_log2(ir_expression *);
148 void ldexp_to_arith(ir_expression *);
149 void dldexp_to_arith(ir_expression *);
150 void dfrexp_sig_to_arith(ir_expression *);
151 void dfrexp_exp_to_arith(ir_expression *);
152 void carry_to_arith(ir_expression *);
153 void borrow_to_arith(ir_expression *);
154 void sat_to_clamp(ir_expression *);
155 void double_dot_to_fma(ir_expression *);
156 void double_lrp(ir_expression *);
157 void dceil_to_dfrac(ir_expression *);
158 void dfloor_to_dfrac(ir_expression *);
159 void dround_even_to_dfrac(ir_expression *);
160 void dtrunc_to_dfrac(ir_expression *);
161 void dsign_to_csel(ir_expression *);
162 void bit_count_to_math(ir_expression *);
163 void extract_to_shifts(ir_expression *);
164 void insert_to_shifts(ir_expression *);
165 void reverse_to_shifts(ir_expression *ir);
166 void find_lsb_to_float_cast(ir_expression *ir);
167 void find_msb_to_float_cast(ir_expression *ir);
168 };
169
170 } /* anonymous namespace */
171
172 /**
173 * Determine if a particular type of lowering should occur
174 */
175 #define lowering(x) (this->lower & x)
176
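/* Entry point.  <what_to_lower> is a bitfield of the *_TO_* flags above,
 * OR'd together by the driver (the flags are assumed to be defined in
 * ir_optimization.h, which is included above), e.g.:
 *
 *    lower_instructions(ir, SUB_TO_ADD_NEG | EXP_TO_EXP2 | LOG_TO_LOG2);
 *
 * Returns true if any expression was rewritten.
 */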
177 bool
178 lower_instructions(exec_list *instructions, unsigned what_to_lower)
179 {
180 lower_instructions_visitor v(what_to_lower);
181
182 visit_list_elements(&v, instructions);
183 return v.progress;
184 }
185
186 void
187 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
188 {
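   /* op0 - op1 -> op0 + neg(op1) */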
189 ir->operation = ir_binop_add;
190 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
191 ir->operands[1], NULL);
192 this->progress = true;
193 }
194
195 void
196 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
197 {
198 assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
199
200 /* New expression for the 1.0 / op1 */
201 ir_rvalue *expr;
202 expr = new(ir) ir_expression(ir_unop_rcp,
203 ir->operands[1]->type,
204 ir->operands[1]);
205
206 /* op0 / op1 -> op0 * (1.0 / op1) */
207 ir->operation = ir_binop_mul;
208 ir->operands[1] = expr;
209
210 this->progress = true;
211 }
212
213 void
214 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
215 {
216 assert(ir->operands[1]->type->is_integer());
217
218 /* Be careful with integer division -- we need to do it as a
219 * float and re-truncate, since rcp(n > 1) of an integer would
220 * just be 0.
221 */
222 ir_rvalue *op0, *op1;
223 const struct glsl_type *vec_type;
224
225 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
226 ir->operands[1]->type->vector_elements,
227 ir->operands[1]->type->matrix_columns);
228
229 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
230 op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
231 else
232 op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
233
234 op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
235
236 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
237 ir->operands[0]->type->vector_elements,
238 ir->operands[0]->type->matrix_columns);
239
240 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
241 op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
242 else
243 op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
244
245 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
246 ir->type->vector_elements,
247 ir->type->matrix_columns);
248
249 op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
250
251 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
252 ir->operation = ir_unop_f2i;
253 ir->operands[0] = op0;
254 } else {
255 ir->operation = ir_unop_i2u;
256 ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
257 }
258 ir->operands[1] = NULL;
259
260 this->progress = true;
261 }
262
263 void
264 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
265 {
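   /* exp(x) == exp2(x * log2(e)), so fold the conversion constant into the
    * operand.
    */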
266 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
267
268 ir->operation = ir_unop_exp2;
269 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
270 ir->operands[0], log2_e);
271 this->progress = true;
272 }
273
274 void
275 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
276 {
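   /* x**y == 2**(y * log2(x)); see POW_TO_EXP2 above. */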
277 ir_expression *const log2_x =
278 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
279 ir->operands[0]);
280
281 ir->operation = ir_unop_exp2;
282 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
283 ir->operands[1], log2_x);
284 ir->operands[1] = NULL;
285 this->progress = true;
286 }
287
288 void
289 lower_instructions_visitor::log_to_log2(ir_expression *ir)
290 {
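   /* log(x) == log2(x) / log2(e), i.e. log2(x) * (1.0 / log2(e)). */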
291 ir->operation = ir_binop_mul;
292 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
293 ir->operands[0], NULL);
294 ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
295 this->progress = true;
296 }
297
298 void
299 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
300 {
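   /* mod(x, y) -> x - y * floor(x / y).  Copy both operands into temporaries
    * first so that each one is evaluated exactly once.
    */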
301 ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
302 ir_var_temporary);
303 ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
304 ir_var_temporary);
305 this->base_ir->insert_before(x);
306 this->base_ir->insert_before(y);
307
308 ir_assignment *const assign_x =
309 new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
310 ir->operands[0], NULL);
311 ir_assignment *const assign_y =
312 new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
313 ir->operands[1], NULL);
314
315 this->base_ir->insert_before(assign_x);
316 this->base_ir->insert_before(assign_y);
317
318 ir_expression *const div_expr =
319 new(ir) ir_expression(ir_binop_div, x->type,
320 new(ir) ir_dereference_variable(x),
321 new(ir) ir_dereference_variable(y));
322
323 /* Don't generate new IR that would need to be lowered in an additional
324 * pass.
325 */
326 if (lowering(DIV_TO_MUL_RCP) && (ir->type->is_float() || ir->type->is_double()))
327 div_to_mul_rcp(div_expr);
328
329 ir_expression *const floor_expr =
330 new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
331
332 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
333 dfloor_to_dfrac(floor_expr);
334
335 ir_expression *const mul_expr =
336 new(ir) ir_expression(ir_binop_mul,
337 new(ir) ir_dereference_variable(y),
338 floor_expr);
339
340 ir->operation = ir_binop_sub;
341 ir->operands[0] = new(ir) ir_dereference_variable(x);
342 ir->operands[1] = mul_expr;
343 this->progress = true;
344 }
345
346 void
347 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
348 {
349 /* Translates
350 * ir_binop_ldexp x exp
351 * into
352 *
353 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
354 * resulting_biased_exp = extracted_biased_exp + exp;
355 *
356 * if (resulting_biased_exp < 1 || x == 0.0f) {
357 * return copysign(0.0, x);
358 * }
359 *
360 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
361 * lshift(i2u(resulting_biased_exp), exp_shift));
362 *
363 * which we can't actually implement as such, since the GLSL IR doesn't
364 * have vectorized if-statements. We actually implement it without branches
365 * using conditional-select:
366 *
367 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
368 * resulting_biased_exp = extracted_biased_exp + exp;
369 *
370 * is_not_zero_or_underflow = logic_and(nequal(x, 0.0f),
371  *                                         gequal(resulting_biased_exp, 1));
372 * x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
373 * resulting_biased_exp = csel(is_not_zero_or_underflow,
374 * resulting_biased_exp, 0);
375 *
376 * return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
377 * lshift(i2u(resulting_biased_exp), exp_shift));
378 */
379
380 const unsigned vec_elem = ir->type->vector_elements;
381
382 /* Types */
383 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
384 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
385
386 /* Constants */
387 ir_constant *zeroi = ir_constant::zero(ir, ivec);
388
389 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
390
391 ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
392 ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
393
394 /* Temporary variables */
395 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
396 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
397
398 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
399 ir_var_temporary);
400
401 ir_variable *extracted_biased_exp =
402 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
403 ir_variable *resulting_biased_exp =
404 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
405
406 ir_variable *is_not_zero_or_underflow =
407 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
408
409 ir_instruction &i = *base_ir;
410
411 /* Copy <x> and <exp> arguments. */
412 i.insert_before(x);
413 i.insert_before(assign(x, ir->operands[0]));
414 i.insert_before(exp);
415 i.insert_before(assign(exp, ir->operands[1]));
416
417 /* Extract the biased exponent from <x>. */
418 i.insert_before(extracted_biased_exp);
419 i.insert_before(assign(extracted_biased_exp,
420 rshift(bitcast_f2i(abs(x)), exp_shift)));
421
422 i.insert_before(resulting_biased_exp);
423 i.insert_before(assign(resulting_biased_exp,
424 add(extracted_biased_exp, exp)));
425
426 /* Test if result is ±0.0, subnormal, or underflow by checking if the
427 * resulting biased exponent would be less than 0x1. If so, the result is
428 * 0.0 with the sign of x. (Actually, invert the conditions so that
429 * immediate values are the second arguments, which is better for i965)
430 */
431 i.insert_before(zero_sign_x);
432 i.insert_before(assign(zero_sign_x,
433 bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
434
435 i.insert_before(is_not_zero_or_underflow);
436 i.insert_before(assign(is_not_zero_or_underflow,
437 logic_and(nequal(x, new(ir) ir_constant(0.0f, vec_elem)),
438 gequal(resulting_biased_exp,
439 new(ir) ir_constant(0x1, vec_elem)))));
440 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
441 x, zero_sign_x)));
442 i.insert_before(assign(resulting_biased_exp,
443 csel(is_not_zero_or_underflow,
444 resulting_biased_exp, zeroi)));
445
446 /* We could test for overflows by checking if the resulting biased exponent
447 * would be greater than 0xFE. Turns out we don't need to because the GLSL
448 * spec says:
449 *
450 * "If this product is too large to be represented in the
451 * floating-point type, the result is undefined."
452 */
453
454 ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
455 ir->operation = ir_unop_bitcast_i2f;
456 ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
457 exp_shift_clone, exp_width);
458 ir->operands[1] = NULL;
459
460 this->progress = true;
461 }
462
463 void
464 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
465 {
466 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
467 * from the significand.
468 */
469
470 const unsigned vec_elem = ir->type->vector_elements;
471
472 /* Types */
473 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
474 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
475
476 /* Constants */
477 ir_constant *zeroi = ir_constant::zero(ir, ivec);
478
479 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
480
481 ir_constant *exp_shift = new(ir) ir_constant(20u);
482 ir_constant *exp_width = new(ir) ir_constant(11u);
483 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
484
485 /* Temporary variables */
486 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
487 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
488
489 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
490 ir_var_temporary);
491
492 ir_variable *extracted_biased_exp =
493 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
494 ir_variable *resulting_biased_exp =
495 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
496
497 ir_variable *is_not_zero_or_underflow =
498 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
499
500 ir_instruction &i = *base_ir;
501
502 /* Copy <x> and <exp> arguments. */
503 i.insert_before(x);
504 i.insert_before(assign(x, ir->operands[0]));
505 i.insert_before(exp);
506 i.insert_before(assign(exp, ir->operands[1]));
507
508 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
509 if (lowering(DFREXP_DLDEXP_TO_ARITH))
510 dfrexp_exp_to_arith(frexp_exp);
511
512 /* Extract the biased exponent from <x>. */
513 i.insert_before(extracted_biased_exp);
514 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
515
516 i.insert_before(resulting_biased_exp);
517 i.insert_before(assign(resulting_biased_exp,
518 add(extracted_biased_exp, exp)));
519
520 /* Test if result is ±0.0, subnormal, or underflow by checking if the
521 * resulting biased exponent would be less than 0x1. If so, the result is
522 * 0.0 with the sign of x. (Actually, invert the conditions so that
523 * immediate values are the second arguments, which is better for i965)
524 * TODO: Implement in a vector fashion.
525 */
526 i.insert_before(zero_sign_x);
527 for (unsigned elem = 0; elem < vec_elem; elem++) {
528 ir_variable *unpacked =
529 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
530 i.insert_before(unpacked);
531 i.insert_before(
532 assign(unpacked,
533 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
534 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
535 WRITEMASK_Y));
536 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
537 i.insert_before(assign(zero_sign_x,
538 expr(ir_unop_pack_double_2x32, unpacked),
539 1 << elem));
540 }
541 i.insert_before(is_not_zero_or_underflow);
542 i.insert_before(assign(is_not_zero_or_underflow,
543 gequal(resulting_biased_exp,
544 new(ir) ir_constant(0x1, vec_elem))));
545 i.insert_before(assign(x, csel(is_not_zero_or_underflow,
546 x, zero_sign_x)));
547 i.insert_before(assign(resulting_biased_exp,
548 csel(is_not_zero_or_underflow,
549 resulting_biased_exp, zeroi)));
550
551 /* We could test for overflows by checking if the resulting biased exponent
552 * would be greater than 0xFE. Turns out we don't need to because the GLSL
553 * spec says:
554 *
555 * "If this product is too large to be represented in the
556 * floating-point type, the result is undefined."
557 */
558
559 ir_rvalue *results[4] = {NULL};
560 for (unsigned elem = 0; elem < vec_elem; elem++) {
561 ir_variable *unpacked =
562 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
563 i.insert_before(unpacked);
564 i.insert_before(
565 assign(unpacked,
566 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
567
568 ir_expression *bfi = bitfield_insert(
569 swizzle_y(unpacked),
570 i2u(swizzle(resulting_biased_exp, elem, 1)),
571 exp_shift->clone(ir, NULL),
572 exp_width->clone(ir, NULL));
573
574 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
575
576 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
577 }
578
579 ir->operation = ir_quadop_vector;
580 ir->operands[0] = results[0];
581 ir->operands[1] = results[1];
582 ir->operands[2] = results[2];
583 ir->operands[3] = results[3];
584
585 /* Don't generate new IR that would need to be lowered in an additional
586 * pass.
587 */
588
589 this->progress = true;
590 }
591
592 void
593 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
594 {
595 const unsigned vec_elem = ir->type->vector_elements;
596 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
597
598 /* Double-precision floating-point values are stored as
599 * 1 sign bit;
600 * 11 exponent bits;
601 * 52 mantissa bits.
602 *
603 * We're just extracting the significand here, so we only need to modify
604 * the upper 32-bit uint. Unfortunately we must extract each double
605 * independently as there is no vector version of unpackDouble.
606 */
607
608 ir_instruction &i = *base_ir;
609
610 ir_variable *is_not_zero =
611 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
612 ir_rvalue *results[4] = {NULL};
613
614 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
615 i.insert_before(is_not_zero);
616 i.insert_before(
617 assign(is_not_zero,
618 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
619
620 /* TODO: Remake this as more vector-friendly when int64 support is
621 * available.
622 */
623 for (unsigned elem = 0; elem < vec_elem; elem++) {
624 ir_constant *zero = new(ir) ir_constant(0u, 1);
625 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
626
627 /* Exponent of double floating-point values in the range [0.5, 1.0). */
628 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
629
630 ir_variable *bits =
631 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
632 ir_variable *unpacked =
633 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
634
635 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
636
637 i.insert_before(bits);
638 i.insert_before(unpacked);
639 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
640
641 /* Manipulate the high uint to remove the exponent and replace it with
642 * either the default exponent or zero.
643 */
644 i.insert_before(assign(bits, swizzle_y(unpacked)));
645 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
646 i.insert_before(assign(bits, bit_or(bits,
647 csel(swizzle(is_not_zero, elem, 1),
648 exponent_value,
649 zero))));
650 i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
651 results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
652 }
653
654 /* Put the dvec back together */
655 ir->operation = ir_quadop_vector;
656 ir->operands[0] = results[0];
657 ir->operands[1] = results[1];
658 ir->operands[2] = results[2];
659 ir->operands[3] = results[3];
660
661 this->progress = true;
662 }
663
664 void
665 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
666 {
667 const unsigned vec_elem = ir->type->vector_elements;
668 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
669 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
670
671 /* Double-precision floating-point values are stored as
672 * 1 sign bit;
673 * 11 exponent bits;
674 * 52 mantissa bits.
675 *
676 * We're just extracting the exponent here, so we only care about the upper
677 * 32-bit uint.
678 */
679
680 ir_instruction &i = *base_ir;
681
682 ir_variable *is_not_zero =
683 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
684 ir_variable *high_words =
685 new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
686 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
687 ir_constant *izero = new(ir) ir_constant(0, vec_elem);
688
689 ir_rvalue *absval = abs(ir->operands[0]);
690
691 i.insert_before(is_not_zero);
692 i.insert_before(high_words);
693 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
694
695 /* Extract all of the upper uints. */
696 for (unsigned elem = 0; elem < vec_elem; elem++) {
697 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
698
699 i.insert_before(assign(high_words,
700 swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
701 1 << elem));
702
703 }
704 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
705 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
706
707 /* For non-zero inputs, shift the exponent down and apply bias. */
708 ir->operation = ir_triop_csel;
709 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
710 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
711 ir->operands[2] = izero;
712
713 this->progress = true;
714 }
715
716 void
717 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
718 {
719 /* Translates
720 * ir_binop_carry x y
721 * into
722 * sum = ir_binop_add x y
723 * bcarry = ir_binop_less sum x
724 * carry = ir_unop_b2i bcarry
725 */
726
727 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
728 ir->operation = ir_unop_i2u;
729 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
730 ir->operands[1] = NULL;
731
732 this->progress = true;
733 }
734
735 void
736 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
737 {
738 /* Translates
739 * ir_binop_borrow x y
740 * into
741 * bcarry = ir_binop_less x y
742 * carry = ir_unop_b2i bcarry
743 */
744
745 ir->operation = ir_unop_i2u;
746 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
747 ir->operands[1] = NULL;
748
749 this->progress = true;
750 }
751
752 void
753 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
754 {
755 /* Translates
756 * ir_unop_saturate x
757 * into
758 * ir_binop_min (ir_binop_max(x, 0.0), 1.0)
759 */
760
761 ir->operation = ir_binop_min;
762 ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
763 ir->operands[0],
764 new(ir) ir_constant(0.0f));
765 ir->operands[1] = new(ir) ir_constant(1.0f);
766
767 this->progress = true;
768 }
769
770 void
771 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
772 {
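   /* dot(a, b) on doubles becomes a chain of fmas accumulated into dot_res:
    * the loop below handles components N-1..1, and the expression itself
    * becomes fma(a.x, b.x, dot_res).
    */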
773 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
774 ir_var_temporary);
775 this->base_ir->insert_before(temp);
776
777 int nc = ir->operands[0]->type->components();
778 for (int i = nc - 1; i >= 1; i--) {
779 ir_assignment *assig;
780 if (i == (nc - 1)) {
781 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
782 swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
783 } else {
784 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
785 swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
786 temp));
787 }
788 this->base_ir->insert_before(assig);
789 }
790
791 ir->operation = ir_triop_fma;
792 ir->operands[0] = swizzle(ir->operands[0], 0, 1);
793 ir->operands[1] = swizzle(ir->operands[1], 0, 1);
794 ir->operands[2] = new(ir) ir_dereference_variable(temp);
795
796 this->progress = true;
797
798 }
799
800 void
801 lower_instructions_visitor::double_lrp(ir_expression *ir)
802 {
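   /* lrp(x, y, a) -> fma(a, y, (1 - a) * x).  When <a> is a scalar (the
    * lrp(dvec, dvec, double) overload) it is broadcast with an XXXX swizzle
    * to match the width of <x>.
    */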
803 int swizval;
804 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
805 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
806
807 switch (op2->type->vector_elements) {
808 case 1:
809 swizval = SWIZZLE_XXXX;
810 break;
811 default:
812 assert(op0->type->vector_elements == op2->type->vector_elements);
813 swizval = SWIZZLE_XYZW;
814 break;
815 }
816
817 ir->operation = ir_triop_fma;
818 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
819 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
820
821 this->progress = true;
822 }
823
824 void
825 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
826 {
827 /*
828 * frtemp = frac(x);
829 * temp = sub(x, frtemp);
830 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
831 */
832 ir_instruction &i = *base_ir;
833 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
834 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
835 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
836 ir_var_temporary);
837
838 i.insert_before(frtemp);
839 i.insert_before(assign(frtemp, fract(ir->operands[0])));
840
841 ir->operation = ir_binop_add;
842 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
843 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
844
845 this->progress = true;
846 }
847
848 void
849 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
850 {
851 /*
852 * frtemp = frac(x);
853 * result = sub(x, frtemp);
854 */
855 ir->operation = ir_binop_sub;
856 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
857
858 this->progress = true;
859 }
860 void
861 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
862 {
863 /*
864 * insane but works
865 * temp = x + 0.5;
866 * frtemp = frac(temp);
867 * t2 = sub(temp, frtemp);
868 * if (frac(x) == 0.5)
869 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
870 * else
871 * result = t2;
872  *
873 */
874 ir_instruction &i = *base_ir;
875 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
876 ir_var_temporary);
877 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
878 ir_var_temporary);
879 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
880 ir_var_temporary);
881 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
882 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
883 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
884
885 i.insert_before(temp);
886 i.insert_before(assign(temp, add(ir->operands[0], p5)));
887
888 i.insert_before(frtemp);
889 i.insert_before(assign(frtemp, fract(temp)));
890
891 i.insert_before(t2);
892 i.insert_before(assign(t2, sub(temp, frtemp)));
893
894 ir->operation = ir_triop_csel;
895 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
896 p5->clone(ir, NULL));
897 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
898 zero),
899 t2,
900 sub(t2, one));
901 ir->operands[2] = new(ir) ir_dereference_variable(t2);
902
903 this->progress = true;
904 }
905
906 void
907 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
908 {
909 /*
910 * frtemp = frac(x);
911 * temp = sub(x, frtemp);
912  * result = x >= 0 ? temp : temp + ((frtemp == 0.0) ? 0 : 1);
913 */
914 ir_rvalue *arg = ir->operands[0];
915 ir_instruction &i = *base_ir;
916
917 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
918 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
919 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
920 ir_var_temporary);
921 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
922 ir_var_temporary);
923
924 i.insert_before(frtemp);
925 i.insert_before(assign(frtemp, fract(arg)));
926 i.insert_before(temp);
927 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
928
929 ir->operation = ir_triop_csel;
930 ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
931 ir->operands[1] = new (ir) ir_dereference_variable(temp);
932 ir->operands[2] = add(temp,
933 csel(equal(frtemp, zero->clone(ir, NULL)),
934 zero->clone(ir, NULL),
935 one));
936
937 this->progress = true;
938 }
939
940 void
941 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
942 {
943 /*
944 * temp = x > 0.0 ? 1.0 : 0.0;
945 * result = x < 0.0 ? -1.0 : temp;
946 */
947 ir_rvalue *arg = ir->operands[0];
948 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
949 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
950 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
951
952 ir->operation = ir_triop_csel;
953 ir->operands[0] = less(arg->clone(ir, NULL),
954 zero->clone(ir, NULL));
955 ir->operands[1] = neg_one;
956 ir->operands[2] = csel(greater(arg, zero),
957 one,
958 zero->clone(ir, NULL));
959
960 this->progress = true;
961 }
962
963 void
964 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
965 {
966 /* For more details, see:
967 *
968  * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
969 */
970 const unsigned elements = ir->operands[0]->type->vector_elements;
971 ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
972 ir_var_temporary);
973 ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
974 ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
975 ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
976 ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
977 ir_constant *c1 = new(ir) ir_constant(1u);
978 ir_constant *c2 = new(ir) ir_constant(2u);
979 ir_constant *c4 = new(ir) ir_constant(4u);
980 ir_constant *c24 = new(ir) ir_constant(24u);
981
982 base_ir->insert_before(temp);
983
984 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
985 base_ir->insert_before(assign(temp, ir->operands[0]));
986 } else {
987 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
988 base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
989 }
990
991 /* temp = temp - ((temp >> 1) & 0x55555555u); */
992 base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
993 c55555555))));
994
995 /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
996 base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
997 bit_and(rshift(temp, c2),
998 c33333333->clone(ir, NULL)))));
999
1000    /* int((((temp + (temp >> 4)) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
1001 ir->operation = ir_unop_u2i;
1002 ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
1003 c01010101),
1004 c24);
1005
1006 this->progress = true;
1007 }
1008
1009 void
1010 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
1011 {
1012 ir_variable *bits =
1013 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1014
1015 base_ir->insert_before(bits);
1016 base_ir->insert_before(assign(bits, ir->operands[2]));
1017
1018 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1019 ir_constant *c1 =
1020 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1021 ir_constant *c32 =
1022 new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1023 ir_constant *cFFFFFFFF =
1024 new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1025
1026 /* At least some hardware treats (x << y) as (x << (y%32)). This means
1027 * we'd get a mask of 0 when bits is 32. Special case it.
1028 *
1029 * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
1030 */
1031 ir_expression *mask = csel(equal(bits, c32),
1032 cFFFFFFFF,
1033 sub(lshift(c1, bits), c1->clone(ir, NULL)));
1034
1035 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1036 *
1037 * If bits is zero, the result will be zero.
1038 *
1039 * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
1040 * select as in the signed integer case.
1041 *
1042 * (value >> offset) & mask;
1043 */
1044 ir->operation = ir_binop_bit_and;
1045 ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
1046 ir->operands[1] = mask;
1047 ir->operands[2] = NULL;
1048 } else {
1049 ir_constant *c0 =
1050 new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
1051 ir_constant *c32 =
1052 new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1053 ir_variable *temp =
1054 new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
1055
1056 /* temp = 32 - bits; */
1057 base_ir->insert_before(temp);
1058 base_ir->insert_before(assign(temp, sub(c32, bits)));
1059
1060 /* expr = value << (temp - offset)) >> temp; */
1061 ir_expression *expr =
1062 rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
1063
1064 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1065 *
1066 * If bits is zero, the result will be zero.
1067 *
1068 * Due to the (x << (y%32)) behavior mentioned before, the (value <<
1069 * (32-0)) doesn't "erase" all of the data as we would like, so finish
1070 * up with:
1071 *
1072 * (bits == 0) ? 0 : e;
1073 */
1074 ir->operation = ir_triop_csel;
1075 ir->operands[0] = equal(c0, bits);
1076 ir->operands[1] = c0->clone(ir, NULL);
1077 ir->operands[2] = expr;
1078 }
1079
1080 this->progress = true;
1081 }
1082
1083 void
1084 lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
1085 {
1086 ir_constant *c1;
1087 ir_constant *c32;
1088 ir_constant *cFFFFFFFF;
1089 ir_variable *offset =
1090 new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
1091 ir_variable *bits =
1092 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1093 ir_variable *mask =
1094 new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
1095
1096 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1097 c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
1098 c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1099 cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
1100 } else {
1101 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1102
1103 c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1104 c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1105 cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1106 }
1107
1108 base_ir->insert_before(offset);
1109 base_ir->insert_before(assign(offset, ir->operands[2]));
1110
1111 base_ir->insert_before(bits);
1112 base_ir->insert_before(assign(bits, ir->operands[3]));
1113
1114 /* At least some hardware treats (x << y) as (x << (y%32)). This means
1115 * we'd get a mask of 0 when bits is 32. Special case it.
1116 *
1117 * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
1118 *
1119 * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1120 *
1121 * The result will be undefined if offset or bits is negative, or if the
1122 * sum of offset and bits is greater than the number of bits used to
1123 * store the operand.
1124 *
1125 * Since it's undefined, there are a couple other ways this could be
1126 * implemented. The other way that was considered was to put the csel
1127 * around the whole thing:
1128 *
1129 * final_result = bits == 32 ? insert : ... ;
1130 */
1131 base_ir->insert_before(mask);
1132
1133 base_ir->insert_before(assign(mask, csel(equal(bits, c32),
1134 cFFFFFFFF,
1135 lshift(sub(lshift(c1, bits),
1136 c1->clone(ir, NULL)),
1137 offset))));
1138
1139 /* (base & ~mask) | ((insert << offset) & mask) */
1140 ir->operation = ir_binop_bit_or;
1141 ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
1142 ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
1143 ir->operands[2] = NULL;
1144 ir->operands[3] = NULL;
1145
1146 this->progress = true;
1147 }
1148
1149 void
1150 lower_instructions_visitor::reverse_to_shifts(ir_expression *ir)
1151 {
1152 /* For more details, see:
1153 *
1154 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
1155 */
1156 ir_constant *c1 =
1157 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1158 ir_constant *c2 =
1159 new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements);
1160 ir_constant *c4 =
1161 new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements);
1162 ir_constant *c8 =
1163 new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements);
1164 ir_constant *c16 =
1165 new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements);
1166 ir_constant *c33333333 =
1167 new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements);
1168 ir_constant *c55555555 =
1169 new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements);
1170 ir_constant *c0F0F0F0F =
1171 new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements);
1172 ir_constant *c00FF00FF =
1173 new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements);
1174 ir_variable *temp =
1175 new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements),
1176 "temp", ir_var_temporary);
1177 ir_instruction &i = *base_ir;
1178
1179 i.insert_before(temp);
1180
1181 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1182 i.insert_before(assign(temp, ir->operands[0]));
1183 } else {
1184 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1185 i.insert_before(assign(temp, i2u(ir->operands[0])));
1186 }
1187
1188 /* Swap odd and even bits.
1189 *
1190 * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1);
1191 */
1192 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555),
1193 lshift(bit_and(temp, c55555555->clone(ir, NULL)),
1194 c1->clone(ir, NULL)))));
1195 /* Swap consecutive pairs.
1196 *
1197 * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2);
1198 */
1199 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333),
1200 lshift(bit_and(temp, c33333333->clone(ir, NULL)),
1201 c2->clone(ir, NULL)))));
1202
1203 /* Swap nibbles.
1204 *
1205 * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4);
1206 */
1207 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F),
1208 lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)),
1209 c4->clone(ir, NULL)))));
1210
1211 /* The last step is, basically, bswap. Swap the bytes, then swap the
1212 * words. When this code is run through GCC on x86, it does generate a
1213 * bswap instruction.
1214 *
1215 * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8);
1216 * temp = ( temp >> 16 ) | ( temp << 16);
1217 */
1218 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF),
1219 lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)),
1220 c8->clone(ir, NULL)))));
1221
1222 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1223 ir->operation = ir_binop_bit_or;
1224 ir->operands[0] = rshift(temp, c16);
1225 ir->operands[1] = lshift(temp, c16->clone(ir, NULL));
1226 } else {
1227 ir->operation = ir_unop_u2i;
1228 ir->operands[0] = bit_or(rshift(temp, c16),
1229 lshift(temp, c16->clone(ir, NULL)));
1230 }
1231
1232 this->progress = true;
1233 }
1234
1235 void
1236 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
1237 {
1238 /* For more details, see:
1239 *
1240 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1241 */
1242 const unsigned elements = ir->operands[0]->type->vector_elements;
1243 ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
1244 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1245 ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1246 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1247 ir_variable *temp =
1248 new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary);
1249 ir_variable *lsb_only =
1250 new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary);
1251 ir_variable *as_float =
1252 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1253 ir_variable *lsb =
1254 new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary);
1255
1256 ir_instruction &i = *base_ir;
1257
1258 i.insert_before(temp);
1259
1260 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1261 i.insert_before(assign(temp, ir->operands[0]));
1262 } else {
1263 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1264 i.insert_before(assign(temp, u2i(ir->operands[0])));
1265 }
1266
1267 /* The int-to-float conversion is lossless because (value & -value) is
1268 * either a power of two or zero. We don't use the result in the zero
1269 * case. The uint() cast is necessary so that 0x80000000 does not
1270 * generate a negative value.
1271 *
1272 * uint lsb_only = uint(value & -value);
1273 * float as_float = float(lsb_only);
1274 */
1275 i.insert_before(lsb_only);
1276 i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
1277
1278 i.insert_before(as_float);
1279 i.insert_before(assign(as_float, u2f(lsb_only)));
1280
1281 /* This is basically an open-coded frexp. Implementations that have a
1282 * native frexp instruction would be better served by that. This is
1283 * optimized versus a full-featured open-coded implementation in two ways:
1284 *
1285 * - We don't care about a correct result from subnormal numbers (including
1286 * 0.0), so the raw exponent can always be safely unbiased.
1287 *
1288 * - The value cannot be negative, so it does not need to be masked off to
1289 * extract the exponent.
1290 *
1291 * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1292 */
1293 i.insert_before(lsb);
1294 i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1295
1296 /* Use lsb_only in the comparison instead of temp so that the & (far above)
1297 * can possibly generate the result without an explicit comparison.
1298 *
1299 * (lsb_only == 0) ? -1 : lsb;
1300 *
1301 * Since our input values are all integers, the unbiased exponent must not
1302 * be negative. It will only be negative (-0x7f, in fact) if lsb_only is
1303 * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is
1304 * better is likely GPU dependent. Either way, the difference should be
1305 * small.
1306 */
1307 ir->operation = ir_triop_csel;
1308 ir->operands[0] = equal(lsb_only, c0);
1309 ir->operands[1] = cminus1;
1310 ir->operands[2] = new(ir) ir_dereference_variable(lsb);
1311
1312 this->progress = true;
1313 }
1314
1315 void
1316 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
1317 {
1318 /* For more details, see:
1319 *
1320 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1321 */
1322 const unsigned elements = ir->operands[0]->type->vector_elements;
1323 ir_constant *c0 = new(ir) ir_constant(int(0), elements);
1324 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1325 ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1326 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1327 ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
1328 ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
1329 ir_variable *temp =
1330 new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary);
1331 ir_variable *as_float =
1332 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1333 ir_variable *msb =
1334 new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary);
1335
1336 ir_instruction &i = *base_ir;
1337
1338 i.insert_before(temp);
1339
1340 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1341 i.insert_before(assign(temp, ir->operands[0]));
1342 } else {
1343 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1344
1345 /* findMSB(uint(abs(some_int))) almost always does the right thing.
1346 * There are two problem values:
1347 *
1348 * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns
1349 * 31. However, findMSB(int(0x80000000)) == 30.
1350 *
1351 * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns
1352 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1353 *
1354 * For a value of zero or negative one, -1 will be returned.
1355 *
1356 * For all negative number cases, including 0x80000000 and 0xffffffff,
1357 * the correct value is obtained from findMSB if instead of negating the
1358  * (already negative) value the logical-not is used. A conditional
1359 * logical-not can be achieved in two instructions.
1360 */
1361 ir_variable *as_int =
1362 new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary);
1363 ir_constant *c31 = new(ir) ir_constant(int(31), elements);
1364
1365 i.insert_before(as_int);
1366 i.insert_before(assign(as_int, ir->operands[0]));
1367 i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
1368 as_int,
1369 rshift(as_int, c31)))));
1370 }
1371
1372 /* The int-to-float conversion is lossless because bits are conditionally
1373 * masked off the bottom of temp to ensure the value has at most 24 bits of
1374 * data or is zero. We don't use the result in the zero case. The uint()
1375 * cast is necessary so that 0x80000000 does not generate a negative value.
1376 *
1377 * float as_float = float(temp > 255 ? temp & ~255 : temp);
1378 */
1379 i.insert_before(as_float);
1380 i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
1381 bit_and(temp, cFFFFFF00),
1382 temp))));
1383
1384 /* This is basically an open-coded frexp. Implementations that have a
1385 * native frexp instruction would be better served by that. This is
1386 * optimized versus a full-featured open-coded implementation in two ways:
1387 *
1388 * - We don't care about a correct result from subnormal numbers (including
1389 * 0.0), so the raw exponent can always be safely unbiased.
1390 *
1391 * - The value cannot be negative, so it does not need to be masked off to
1392 * extract the exponent.
1393 *
1394 * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1395 */
1396 i.insert_before(msb);
1397 i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1398
1399 /* Use msb in the comparison instead of temp so that the subtract can
1400 * possibly generate the result without an explicit comparison.
1401 *
1402 * (msb < 0) ? -1 : msb;
1403 *
1404 * Since our input values are all integers, the unbiased exponent must not
1405 * be negative. It will only be negative (-0x7f, in fact) if temp is 0.
1406 */
1407 ir->operation = ir_triop_csel;
1408 ir->operands[0] = less(msb, c0);
1409 ir->operands[1] = cminus1;
1410 ir->operands[2] = new(ir) ir_dereference_variable(msb);
1411
1412 this->progress = true;
1413 }
1414
1415 ir_visitor_status
1416 lower_instructions_visitor::visit_leave(ir_expression *ir)
1417 {
1418 switch (ir->operation) {
1419 case ir_binop_dot:
1420 if (ir->operands[0]->type->is_double())
1421 double_dot_to_fma(ir);
1422 break;
1423 case ir_triop_lrp:
1424 if (ir->operands[0]->type->is_double())
1425 double_lrp(ir);
1426 break;
1427 case ir_binop_sub:
1428 if (lowering(SUB_TO_ADD_NEG))
1429 sub_to_add_neg(ir);
1430 break;
1431
1432 case ir_binop_div:
1433 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
1434 int_div_to_mul_rcp(ir);
1435 else if ((ir->operands[1]->type->is_float() ||
1436 ir->operands[1]->type->is_double()) && lowering(DIV_TO_MUL_RCP))
1437 div_to_mul_rcp(ir);
1438 break;
1439
1440 case ir_unop_exp:
1441 if (lowering(EXP_TO_EXP2))
1442 exp_to_exp2(ir);
1443 break;
1444
1445 case ir_unop_log:
1446 if (lowering(LOG_TO_LOG2))
1447 log_to_log2(ir);
1448 break;
1449
1450 case ir_binop_mod:
1451 if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
1452 mod_to_floor(ir);
1453 break;
1454
1455 case ir_binop_pow:
1456 if (lowering(POW_TO_EXP2))
1457 pow_to_exp2(ir);
1458 break;
1459
1460 case ir_binop_ldexp:
1461 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1462 ldexp_to_arith(ir);
1463 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1464 dldexp_to_arith(ir);
1465 break;
1466
1467 case ir_unop_frexp_exp:
1468 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1469 dfrexp_exp_to_arith(ir);
1470 break;
1471
1472 case ir_unop_frexp_sig:
1473 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1474 dfrexp_sig_to_arith(ir);
1475 break;
1476
1477 case ir_binop_carry:
1478 if (lowering(CARRY_TO_ARITH))
1479 carry_to_arith(ir);
1480 break;
1481
1482 case ir_binop_borrow:
1483 if (lowering(BORROW_TO_ARITH))
1484 borrow_to_arith(ir);
1485 break;
1486
1487 case ir_unop_saturate:
1488 if (lowering(SAT_TO_CLAMP))
1489 sat_to_clamp(ir);
1490 break;
1491
1492 case ir_unop_trunc:
1493 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1494 dtrunc_to_dfrac(ir);
1495 break;
1496
1497 case ir_unop_ceil:
1498 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1499 dceil_to_dfrac(ir);
1500 break;
1501
1502 case ir_unop_floor:
1503 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1504 dfloor_to_dfrac(ir);
1505 break;
1506
1507 case ir_unop_round_even:
1508 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1509 dround_even_to_dfrac(ir);
1510 break;
1511
1512 case ir_unop_sign:
1513 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1514 dsign_to_csel(ir);
1515 break;
1516
1517 case ir_unop_bit_count:
1518 if (lowering(BIT_COUNT_TO_MATH))
1519 bit_count_to_math(ir);
1520 break;
1521
1522 case ir_triop_bitfield_extract:
1523 if (lowering(EXTRACT_TO_SHIFTS))
1524 extract_to_shifts(ir);
1525 break;
1526
1527 case ir_quadop_bitfield_insert:
1528 if (lowering(INSERT_TO_SHIFTS))
1529 insert_to_shifts(ir);
1530 break;
1531
1532 case ir_unop_bitfield_reverse:
1533 if (lowering(REVERSE_TO_SHIFTS))
1534 reverse_to_shifts(ir);
1535 break;
1536
1537 case ir_unop_find_lsb:
1538 if (lowering(FIND_LSB_TO_FLOAT_CAST))
1539 find_lsb_to_float_cast(ir);
1540 break;
1541
1542 case ir_unop_find_msb:
1543 if (lowering(FIND_MSB_TO_FLOAT_CAST))
1544 find_msb_to_float_cast(ir);
1545 break;
1546
1547 default:
1548 return visit_continue;
1549 }
1550
1551 return visit_continue;
1552 }