[gcc.git] / gcc / tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
 20 /* The first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
 43    First of all, with some experiments it was found that the
 44    transformation is not always useful if there are only two divisions
 45    by the same divisor.  This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has less than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
 63    and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
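/* As a concrete sketch of the machinery below: assume blocks B1 and B2
   both contain a division by the same SSA name D, B1 dominates B2, and
   B2 post-dominates B1.  register_division_in links B2's occurrence as a
   child of B1's, compute_merit then credits B1 with two post-dominating
   divisions (ignoring exception edges), and, assuming the target's
   min_divisions_for_recip_mul threshold is 2, insert_reciprocals emits
   RD = 1.0 / D right before the division in B1; both divisions are then
   rewritten as multiplications by RD.  */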
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "tree.h"
93 #include "gimplify.h"
94 #include "gimple-iterator.h"
95 #include "gimple-ssa.h"
96 #include "tree-cfg.h"
97 #include "tree-phinodes.h"
98 #include "ssa-iterators.h"
99 #include "tree-ssanames.h"
100 #include "tree-dfa.h"
101 #include "tree-ssa.h"
102 #include "tree-pass.h"
103 #include "alloc-pool.h"
104 #include "basic-block.h"
105 #include "target.h"
106 #include "gimple-pretty-print.h"
107
108 /* FIXME: RTL headers have to be included here for optabs. */
109 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
110 #include "expr.h" /* Because optabs.h wants sepops. */
111 #include "optabs.h"
112
113 /* This structure represents one basic block that either computes a
 114    division, or is a common dominator for basic blocks that compute a
115 division. */
116 struct occurrence {
117 /* The basic block represented by this structure. */
118 basic_block bb;
119
120 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
121 inserted in BB. */
122 tree recip_def;
123
124 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
125 was inserted in BB. */
126 gimple recip_def_stmt;
127
128 /* Pointer to a list of "struct occurrence"s for blocks dominated
129 by BB. */
130 struct occurrence *children;
131
 132   /* Pointer to the next "struct occurrence" in the list of blocks
133 sharing a common dominator. */
134 struct occurrence *next;
135
136 /* The number of divisions that are in BB before compute_merit. The
137 number of divisions that are in BB or post-dominate it after
138 compute_merit. */
139 int num_divisions;
140
141 /* True if the basic block has a division, false if it is a common
142 dominator for basic blocks that do. If it is false and trapping
143 math is active, BB is not a candidate for inserting a reciprocal. */
144 bool bb_has_division;
145 };
146
147 static struct
148 {
149 /* Number of 1.0/X ops inserted. */
150 int rdivs_inserted;
151
152 /* Number of 1.0/FUNC ops inserted. */
153 int rfuncs_inserted;
154 } reciprocal_stats;
155
156 static struct
157 {
158 /* Number of cexpi calls inserted. */
159 int inserted;
160 } sincos_stats;
161
162 static struct
163 {
164 /* Number of hand-written 16-bit bswaps found. */
165 int found_16bit;
166
167 /* Number of hand-written 32-bit bswaps found. */
168 int found_32bit;
169
170 /* Number of hand-written 64-bit bswaps found. */
171 int found_64bit;
172 } bswap_stats;
173
174 static struct
175 {
176 /* Number of widening multiplication ops inserted. */
177 int widen_mults_inserted;
178
179 /* Number of integer multiply-and-accumulate ops inserted. */
180 int maccs_inserted;
181
182 /* Number of fp fused multiply-add ops inserted. */
183 int fmas_inserted;
184 } widen_mul_stats;
185
186 /* The instance of "struct occurrence" representing the highest
187 interesting block in the dominator tree. */
188 static struct occurrence *occ_head;
189
190 /* Allocation pool for getting instances of "struct occurrence". */
191 static alloc_pool occ_pool;
192
193
194
195 /* Allocate and return a new struct occurrence for basic block BB, and
196 whose children list is headed by CHILDREN. */
197 static struct occurrence *
198 occ_new (basic_block bb, struct occurrence *children)
199 {
200 struct occurrence *occ;
201
202 bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
203 memset (occ, 0, sizeof (struct occurrence));
204
205 occ->bb = bb;
206 occ->children = children;
207 return occ;
208 }
209
210
211 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
212 list of "struct occurrence"s, one per basic block, having IDOM as
213 their common dominator.
214
215 We try to insert NEW_OCC as deep as possible in the tree, and we also
216 insert any other block that is a common dominator for BB and one
217 block already in the tree. */
218
219 static void
220 insert_bb (struct occurrence *new_occ, basic_block idom,
221 struct occurrence **p_head)
222 {
223 struct occurrence *occ, **p_occ;
224
225 for (p_occ = p_head; (occ = *p_occ) != NULL; )
226 {
227 basic_block bb = new_occ->bb, occ_bb = occ->bb;
228 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
229 if (dom == bb)
230 {
231 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
232 from its list. */
233 *p_occ = occ->next;
234 occ->next = new_occ->children;
235 new_occ->children = occ;
236
237 /* Try the next block (it may as well be dominated by BB). */
238 }
239
240 else if (dom == occ_bb)
241 {
242 /* OCC_BB dominates BB. Tail recurse to look deeper. */
243 insert_bb (new_occ, dom, &occ->children);
244 return;
245 }
246
247 else if (dom != idom)
248 {
249 gcc_assert (!dom->aux);
250
251 /* There is a dominator between IDOM and BB, add it and make
252 two children out of NEW_OCC and OCC. First, remove OCC from
253 its list. */
254 *p_occ = occ->next;
255 new_occ->next = occ;
256 occ->next = NULL;
257
258 /* None of the previous blocks has DOM as a dominator: if we tail
259 recursed, we would reexamine them uselessly. Just switch BB with
260 DOM, and go on looking for blocks dominated by DOM. */
261 new_occ = occ_new (dom, new_occ);
262 }
263
264 else
265 {
266 /* Nothing special, go on with the next element. */
267 p_occ = &occ->next;
268 }
269 }
270
271 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
272 new_occ->next = *p_head;
273 *p_head = new_occ;
274 }
275
276 /* Register that we found a division in BB. */
277
278 static inline void
279 register_division_in (basic_block bb)
280 {
281 struct occurrence *occ;
282
283 occ = (struct occurrence *) bb->aux;
284 if (!occ)
285 {
286 occ = occ_new (bb, NULL);
287 insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
288 }
289
290 occ->bb_has_division = true;
291 occ->num_divisions++;
292 }
293
294
295 /* Compute the number of divisions that postdominate each block in OCC and
296 its children. */
297
298 static void
299 compute_merit (struct occurrence *occ)
300 {
301 struct occurrence *occ_child;
302 basic_block dom = occ->bb;
303
304 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
305 {
306 basic_block bb;
307 if (occ_child->children)
308 compute_merit (occ_child);
309
310 if (flag_exceptions)
311 bb = single_noncomplex_succ (dom);
312 else
313 bb = dom;
314
315 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
316 occ->num_divisions += occ_child->num_divisions;
317 }
318 }
319
320
321 /* Return whether USE_STMT is a floating-point division by DEF. */
322 static inline bool
323 is_division_by (gimple use_stmt, tree def)
324 {
325 return is_gimple_assign (use_stmt)
326 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
327 && gimple_assign_rhs2 (use_stmt) == def
328 /* Do not recognize x / x as valid division, as we are getting
329 confused later by replacing all immediate uses x in such
330 a stmt. */
331 && gimple_assign_rhs1 (use_stmt) != def;
332 }
333
334 /* Walk the subset of the dominator tree rooted at OCC, setting the
335 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
336 the given basic block. The field may be left NULL, of course,
337 if it is not possible or profitable to do the optimization.
338
339 DEF_BSI is an iterator pointing at the statement defining DEF.
340 If RECIP_DEF is set, a dominator already has a computation that can
341 be used. */
342
343 static void
344 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
345 tree def, tree recip_def, int threshold)
346 {
347 tree type;
348 gimple new_stmt;
349 gimple_stmt_iterator gsi;
350 struct occurrence *occ_child;
351
352 if (!recip_def
353 && (occ->bb_has_division || !flag_trapping_math)
354 && occ->num_divisions >= threshold)
355 {
356 /* Make a variable with the replacement and substitute it. */
357 type = TREE_TYPE (def);
358 recip_def = create_tmp_reg (type, "reciptmp");
359 new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
360 build_one_cst (type), def);
361
362 if (occ->bb_has_division)
363 {
364 /* Case 1: insert before an existing division. */
365 gsi = gsi_after_labels (occ->bb);
366 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
367 gsi_next (&gsi);
368
369 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
370 }
371 else if (def_gsi && occ->bb == def_gsi->bb)
372 {
373 /* Case 2: insert right after the definition. Note that this will
374 never happen if the definition statement can throw, because in
375 that case the sole successor of the statement's basic block will
376 dominate all the uses as well. */
377 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
378 }
379 else
380 {
381 /* Case 3: insert in a basic block not containing defs/uses. */
382 gsi = gsi_after_labels (occ->bb);
383 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
384 }
385
386 reciprocal_stats.rdivs_inserted++;
387
388 occ->recip_def_stmt = new_stmt;
389 }
390
391 occ->recip_def = recip_def;
392 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
393 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
394 }
395
396
397 /* Replace the division at USE_P with a multiplication by the reciprocal, if
398 possible. */
399
400 static inline void
401 replace_reciprocal (use_operand_p use_p)
402 {
403 gimple use_stmt = USE_STMT (use_p);
404 basic_block bb = gimple_bb (use_stmt);
405 struct occurrence *occ = (struct occurrence *) bb->aux;
406
407 if (optimize_bb_for_speed_p (bb)
408 && occ->recip_def && use_stmt != occ->recip_def_stmt)
409 {
410 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
411 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
412 SET_USE (use_p, occ->recip_def);
413 fold_stmt_inplace (&gsi);
414 update_stmt (use_stmt);
415 }
416 }
417
418
419 /* Free OCC and return one more "struct occurrence" to be freed. */
420
421 static struct occurrence *
422 free_bb (struct occurrence *occ)
423 {
424 struct occurrence *child, *next;
425
426 /* First get the two pointers hanging off OCC. */
427 next = occ->next;
428 child = occ->children;
429 occ->bb->aux = NULL;
430 pool_free (occ_pool, occ);
431
432 /* Now ensure that we don't recurse unless it is necessary. */
433 if (!child)
434 return next;
435 else
436 {
437 while (next)
438 next = free_bb (next);
439
440 return child;
441 }
442 }
443
444
445 /* Look for floating-point divisions among DEF's uses, and try to
446 replace them by multiplications with the reciprocal. Add
447 as many statements computing the reciprocal as needed.
448
449 DEF must be a GIMPLE register of a floating-point type. */
450
451 static void
452 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
453 {
454 use_operand_p use_p;
455 imm_use_iterator use_iter;
456 struct occurrence *occ;
457 int count = 0, threshold;
458
459 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
460
461 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
462 {
463 gimple use_stmt = USE_STMT (use_p);
464 if (is_division_by (use_stmt, def))
465 {
466 register_division_in (gimple_bb (use_stmt));
467 count++;
468 }
469 }
470
471 /* Do the expensive part only if we can hope to optimize something. */
472 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
473 if (count >= threshold)
474 {
475 gimple use_stmt;
476 for (occ = occ_head; occ; occ = occ->next)
477 {
478 compute_merit (occ);
479 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
480 }
481
482 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
483 {
484 if (is_division_by (use_stmt, def))
485 {
486 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
487 replace_reciprocal (use_p);
488 }
489 }
490 }
491
492 for (occ = occ_head; occ; )
493 occ = free_bb (occ);
494
495 occ_head = NULL;
496 }
497
498 static bool
499 gate_cse_reciprocals (void)
500 {
501 return optimize && flag_reciprocal_math;
502 }
503
504 /* Go through all the floating-point SSA_NAMEs, and call
505 execute_cse_reciprocals_1 on each of them. */
506 static unsigned int
507 execute_cse_reciprocals (void)
508 {
509 basic_block bb;
510 tree arg;
511
512 occ_pool = create_alloc_pool ("dominators for recip",
513 sizeof (struct occurrence),
514 n_basic_blocks / 3 + 1);
515
516 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
517 calculate_dominance_info (CDI_DOMINATORS);
518 calculate_dominance_info (CDI_POST_DOMINATORS);
519
520 #ifdef ENABLE_CHECKING
521 FOR_EACH_BB (bb)
522 gcc_assert (!bb->aux);
523 #endif
524
525 for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
526 if (FLOAT_TYPE_P (TREE_TYPE (arg))
527 && is_gimple_reg (arg))
528 {
529 tree name = ssa_default_def (cfun, arg);
530 if (name)
531 execute_cse_reciprocals_1 (NULL, name);
532 }
533
534 FOR_EACH_BB (bb)
535 {
536 gimple_stmt_iterator gsi;
537 gimple phi;
538 tree def;
539
540 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
541 {
542 phi = gsi_stmt (gsi);
543 def = PHI_RESULT (phi);
544 if (! virtual_operand_p (def)
545 && FLOAT_TYPE_P (TREE_TYPE (def)))
546 execute_cse_reciprocals_1 (NULL, def);
547 }
548
549 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
550 {
551 gimple stmt = gsi_stmt (gsi);
552
553 if (gimple_has_lhs (stmt)
554 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
555 && FLOAT_TYPE_P (TREE_TYPE (def))
556 && TREE_CODE (def) == SSA_NAME)
557 execute_cse_reciprocals_1 (&gsi, def);
558 }
559
560 if (optimize_bb_for_size_p (bb))
561 continue;
562
563 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
564 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
565 {
566 gimple stmt = gsi_stmt (gsi);
567 tree fndecl;
568
569 if (is_gimple_assign (stmt)
570 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
571 {
572 tree arg1 = gimple_assign_rhs2 (stmt);
573 gimple stmt1;
574
575 if (TREE_CODE (arg1) != SSA_NAME)
576 continue;
577
578 stmt1 = SSA_NAME_DEF_STMT (arg1);
579
580 if (is_gimple_call (stmt1)
581 && gimple_call_lhs (stmt1)
582 && (fndecl = gimple_call_fndecl (stmt1))
583 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
584 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
585 {
586 enum built_in_function code;
587 bool md_code, fail;
588 imm_use_iterator ui;
589 use_operand_p use_p;
590
591 code = DECL_FUNCTION_CODE (fndecl);
592 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
593
594 fndecl = targetm.builtin_reciprocal (code, md_code, false);
595 if (!fndecl)
596 continue;
597
598 /* Check that all uses of the SSA name are divisions,
599 otherwise replacing the defining statement will do
600 the wrong thing. */
601 fail = false;
602 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
603 {
604 gimple stmt2 = USE_STMT (use_p);
605 if (is_gimple_debug (stmt2))
606 continue;
607 if (!is_gimple_assign (stmt2)
608 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
609 || gimple_assign_rhs1 (stmt2) == arg1
610 || gimple_assign_rhs2 (stmt2) != arg1)
611 {
612 fail = true;
613 break;
614 }
615 }
616 if (fail)
617 continue;
618
619 gimple_replace_ssa_lhs (stmt1, arg1);
620 gimple_call_set_fndecl (stmt1, fndecl);
621 update_stmt (stmt1);
622 reciprocal_stats.rfuncs_inserted++;
623
624 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
625 {
626 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
627 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
628 fold_stmt_inplace (&gsi);
629 update_stmt (stmt);
630 }
631 }
632 }
633 }
634 }
635
636 statistics_counter_event (cfun, "reciprocal divs inserted",
637 reciprocal_stats.rdivs_inserted);
638 statistics_counter_event (cfun, "reciprocal functions inserted",
639 reciprocal_stats.rfuncs_inserted);
640
641 free_dominance_info (CDI_DOMINATORS);
642 free_dominance_info (CDI_POST_DOMINATORS);
643 free_alloc_pool (occ_pool);
644 return 0;
645 }
646
647 namespace {
648
649 const pass_data pass_data_cse_reciprocals =
650 {
651 GIMPLE_PASS, /* type */
652 "recip", /* name */
653 OPTGROUP_NONE, /* optinfo_flags */
654 true, /* has_gate */
655 true, /* has_execute */
656 TV_NONE, /* tv_id */
657 PROP_ssa, /* properties_required */
658 0, /* properties_provided */
659 0, /* properties_destroyed */
660 0, /* todo_flags_start */
661 ( TODO_update_ssa | TODO_verify_ssa
662 | TODO_verify_stmts ), /* todo_flags_finish */
663 };
664
665 class pass_cse_reciprocals : public gimple_opt_pass
666 {
667 public:
668 pass_cse_reciprocals (gcc::context *ctxt)
669 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
670 {}
671
672 /* opt_pass methods: */
673 bool gate () { return gate_cse_reciprocals (); }
674 unsigned int execute () { return execute_cse_reciprocals (); }
675
676 }; // class pass_cse_reciprocals
677
678 } // anon namespace
679
680 gimple_opt_pass *
681 make_pass_cse_reciprocals (gcc::context *ctxt)
682 {
683 return new pass_cse_reciprocals (ctxt);
684 }
685
686 /* Records an occurrence at statement USE_STMT in the vector of trees
687 STMTS if it is dominated by *TOP_BB or dominates it or this basic block
688 is not yet initialized. Returns true if the occurrence was pushed on
689 the vector. Adjusts *TOP_BB to be the basic block dominating all
690 statements in the vector. */
691
692 static bool
693 maybe_record_sincos (vec<gimple> *stmts,
694 basic_block *top_bb, gimple use_stmt)
695 {
696 basic_block use_bb = gimple_bb (use_stmt);
697 if (*top_bb
698 && (*top_bb == use_bb
699 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
700 stmts->safe_push (use_stmt);
701 else if (!*top_bb
702 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
703 {
704 stmts->safe_push (use_stmt);
705 *top_bb = use_bb;
706 }
707 else
708 return false;
709
710 return true;
711 }
712
713 /* Look for sin, cos and cexpi calls with the same argument NAME and
714 create a single call to cexpi CSEing the result in this case.
715 We first walk over all immediate uses of the argument collecting
716 statements that we can CSE in a vector and in a second pass replace
717 the statement rhs with a REALPART or IMAGPART expression on the
718 result of the cexpi call we insert before the use statement that
719 dominates all other candidates. */
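   For example, assuming cexpi is available for the type of x:

     a = sin (x);                 t = cexpi (x);
     b = cos (x);       ==>       a = IMAGPART_EXPR <t>;
                                  b = REALPART_EXPR <t>;

   where the cexpi call is inserted in the block dominating both
   original calls.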
720
721 static bool
722 execute_cse_sincos_1 (tree name)
723 {
724 gimple_stmt_iterator gsi;
725 imm_use_iterator use_iter;
726 tree fndecl, res, type;
727 gimple def_stmt, use_stmt, stmt;
728 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
729 vec<gimple> stmts = vNULL;
730 basic_block top_bb = NULL;
731 int i;
732 bool cfg_changed = false;
733
734 type = TREE_TYPE (name);
735 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
736 {
737 if (gimple_code (use_stmt) != GIMPLE_CALL
738 || !gimple_call_lhs (use_stmt)
739 || !(fndecl = gimple_call_fndecl (use_stmt))
740 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
741 continue;
742
743 switch (DECL_FUNCTION_CODE (fndecl))
744 {
745 CASE_FLT_FN (BUILT_IN_COS):
746 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
747 break;
748
749 CASE_FLT_FN (BUILT_IN_SIN):
750 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
751 break;
752
753 CASE_FLT_FN (BUILT_IN_CEXPI):
754 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
755 break;
756
757 default:;
758 }
759 }
760
761 if (seen_cos + seen_sin + seen_cexpi <= 1)
762 {
763 stmts.release ();
764 return false;
765 }
766
767 /* Simply insert cexpi at the beginning of top_bb but not earlier than
768 the name def statement. */
769 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
770 if (!fndecl)
771 return false;
772 stmt = gimple_build_call (fndecl, 1, name);
773 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
774 gimple_call_set_lhs (stmt, res);
775
776 def_stmt = SSA_NAME_DEF_STMT (name);
777 if (!SSA_NAME_IS_DEFAULT_DEF (name)
778 && gimple_code (def_stmt) != GIMPLE_PHI
779 && gimple_bb (def_stmt) == top_bb)
780 {
781 gsi = gsi_for_stmt (def_stmt);
782 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
783 }
784 else
785 {
786 gsi = gsi_after_labels (top_bb);
787 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
788 }
789 sincos_stats.inserted++;
790
791 /* And adjust the recorded old call sites. */
792 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
793 {
794 tree rhs = NULL;
795 fndecl = gimple_call_fndecl (use_stmt);
796
797 switch (DECL_FUNCTION_CODE (fndecl))
798 {
799 CASE_FLT_FN (BUILT_IN_COS):
800 rhs = fold_build1 (REALPART_EXPR, type, res);
801 break;
802
803 CASE_FLT_FN (BUILT_IN_SIN):
804 rhs = fold_build1 (IMAGPART_EXPR, type, res);
805 break;
806
807 CASE_FLT_FN (BUILT_IN_CEXPI):
808 rhs = res;
809 break;
810
811 default:;
812 gcc_unreachable ();
813 }
814
815 /* Replace call with a copy. */
816 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
817
818 gsi = gsi_for_stmt (use_stmt);
819 gsi_replace (&gsi, stmt, true);
820 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
821 cfg_changed = true;
822 }
823
824 stmts.release ();
825
826 return cfg_changed;
827 }
828
829 /* To evaluate powi(x,n), the floating point value x raised to the
830 constant integer exponent n, we use a hybrid algorithm that
831 combines the "window method" with look-up tables. For an
832 introduction to exponentiation algorithms and "addition chains",
833 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
834 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
835 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
836 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
837
838 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
839 multiplications to inline before calling the system library's pow
840 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
841 so this default never requires calling pow, powf or powl. */
842
843 #ifndef POWI_MAX_MULTS
844 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
845 #endif
846
847 /* The size of the "optimal power tree" lookup table. All
848 exponents less than this value are simply looked up in the
849 powi_table below. This threshold is also used to size the
850 cache of pseudo registers that hold intermediate results. */
851 #define POWI_TABLE_SIZE 256
852
 853 /* The size, in bits, of the window used in the "window method"
854 exponentiation algorithm. This is equivalent to a radix of
855 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
856 #define POWI_WINDOW_SIZE 3
857
858 /* The following table is an efficient representation of an
859 "optimal power tree". For each value, i, the corresponding
 860    value, j, in the table states that an optimal evaluation
861 sequence for calculating pow(x,i) can be found by evaluating
862 pow(x,j)*pow(x,i-j). An optimal power tree for the first
863 100 integers is given in Knuth's "Seminumerical algorithms". */
864
865 static const unsigned char powi_table[POWI_TABLE_SIZE] =
866 {
867 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
868 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
869 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
870 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
871 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
872 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
873 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
874 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
875 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
876 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
877 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
878 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
879 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
880 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
881 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
882 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
883 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
884 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
885 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
886 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
887 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
888 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
889 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
890 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
891 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
892 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
893 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
894 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
895 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
896 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
897 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
898 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
899 };
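/* For example, powi_table[7] is 4, so x**7 is evaluated as x**3 * x**4;
   expanding x**3 and x**4 through powi_table[3] == 2 and powi_table[4] == 2
   yields the sequence x*x, x**2 * x, x**2 * x**2, x**3 * x**4, i.e. four
   multiplications in total.  */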
900
901
902 /* Return the number of multiplications required to calculate
903 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
904 subroutine of powi_cost. CACHE is an array indicating
905 which exponents have already been calculated. */
906
907 static int
908 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
909 {
910 /* If we've already calculated this exponent, then this evaluation
911 doesn't require any additional multiplications. */
912 if (cache[n])
913 return 0;
914
915 cache[n] = true;
916 return powi_lookup_cost (n - powi_table[n], cache)
917 + powi_lookup_cost (powi_table[n], cache) + 1;
918 }
919
920 /* Return the number of multiplications required to calculate
921 powi(x,n) for an arbitrary x, given the exponent N. This
922 function needs to be kept in sync with powi_as_mults below. */
923
924 static int
925 powi_cost (HOST_WIDE_INT n)
926 {
927 bool cache[POWI_TABLE_SIZE];
928 unsigned HOST_WIDE_INT digit;
929 unsigned HOST_WIDE_INT val;
930 int result;
931
932 if (n == 0)
933 return 0;
934
935 /* Ignore the reciprocal when calculating the cost. */
936 val = (n < 0) ? -n : n;
937
938 /* Initialize the exponent cache. */
939 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
940 cache[1] = true;
941
942 result = 0;
943
944 while (val >= POWI_TABLE_SIZE)
945 {
946 if (val & 1)
947 {
948 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
949 result += powi_lookup_cost (digit, cache)
950 + POWI_WINDOW_SIZE + 1;
951 val >>= POWI_WINDOW_SIZE;
952 }
953 else
954 {
955 val >>= 1;
956 result++;
957 }
958 }
959
960 return result + powi_lookup_cost (val, cache);
961 }
962
963 /* Recursive subroutine of powi_as_mults. This function takes the
964 array, CACHE, of already calculated exponents and an exponent N and
965 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
966
967 static tree
968 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
969 HOST_WIDE_INT n, tree *cache)
970 {
971 tree op0, op1, ssa_target;
972 unsigned HOST_WIDE_INT digit;
973 gimple mult_stmt;
974
975 if (n < POWI_TABLE_SIZE && cache[n])
976 return cache[n];
977
978 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
979
980 if (n < POWI_TABLE_SIZE)
981 {
982 cache[n] = ssa_target;
983 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
984 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
985 }
986 else if (n & 1)
987 {
988 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
989 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
990 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
991 }
992 else
993 {
994 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
995 op1 = op0;
996 }
997
998 mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
999 gimple_set_location (mult_stmt, loc);
1000 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1001
1002 return ssa_target;
1003 }
1004
1005 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1006 This function needs to be kept in sync with powi_cost above. */
1007
1008 static tree
1009 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1010 tree arg0, HOST_WIDE_INT n)
1011 {
1012 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1013 gimple div_stmt;
1014 tree target;
1015
1016 if (n == 0)
1017 return build_real (type, dconst1);
1018
1019 memset (cache, 0, sizeof (cache));
1020 cache[1] = arg0;
1021
1022 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1023 if (n >= 0)
1024 return result;
1025
1026 /* If the original exponent was negative, reciprocate the result. */
1027 target = make_temp_ssa_name (type, NULL, "powmult");
1028 div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1029 build_real (type, dconst1),
1030 result);
1031 gimple_set_location (div_stmt, loc);
1032 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1033
1034 return target;
1035 }
1036
1037 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1038 location info LOC. If the arguments are appropriate, create an
1039 equivalent sequence of statements prior to GSI using an optimal
 1040    number of multiplications, and return an expression holding the
1041 result. */
1042
1043 static tree
1044 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1045 tree arg0, HOST_WIDE_INT n)
1046 {
1047 /* Avoid largest negative number. */
1048 if (n != -n
1049 && ((n >= -1 && n <= 2)
1050 || (optimize_function_for_speed_p (cfun)
1051 && powi_cost (n) <= POWI_MAX_MULTS)))
1052 return powi_as_mults (gsi, loc, arg0, n);
1053
1054 return NULL_TREE;
1055 }
1056
1057 /* Build a gimple call statement that calls FN with argument ARG.
1058 Set the lhs of the call statement to a fresh SSA name. Insert the
1059 statement prior to GSI's current position, and return the fresh
1060 SSA name. */
1061
1062 static tree
1063 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1064 tree fn, tree arg)
1065 {
1066 gimple call_stmt;
1067 tree ssa_target;
1068
1069 call_stmt = gimple_build_call (fn, 1, arg);
1070 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1071 gimple_set_lhs (call_stmt, ssa_target);
1072 gimple_set_location (call_stmt, loc);
1073 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1074
1075 return ssa_target;
1076 }
1077
1078 /* Build a gimple binary operation with the given CODE and arguments
1079 ARG0, ARG1, assigning the result to a new SSA name for variable
1080 TARGET. Insert the statement prior to GSI's current position, and
1081 return the fresh SSA name.*/
1082
1083 static tree
1084 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1085 const char *name, enum tree_code code,
1086 tree arg0, tree arg1)
1087 {
1088 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1089 gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1090 gimple_set_location (stmt, loc);
1091 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1092 return result;
1093 }
1094
1095 /* Build a gimple reference operation with the given CODE and argument
1096 ARG, assigning the result to a new SSA name of TYPE with NAME.
1097 Insert the statement prior to GSI's current position, and return
1098 the fresh SSA name. */
1099
1100 static inline tree
1101 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1102 const char *name, enum tree_code code, tree arg0)
1103 {
1104 tree result = make_temp_ssa_name (type, NULL, name);
1105 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1106 gimple_set_location (stmt, loc);
1107 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1108 return result;
1109 }
1110
1111 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1112 prior to GSI's current position, and return the fresh SSA name. */
1113
1114 static tree
1115 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1116 tree type, tree val)
1117 {
1118 tree result = make_ssa_name (type, NULL);
1119 gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1120 gimple_set_location (stmt, loc);
1121 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1122 return result;
1123 }
1124
1125 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1126 with location info LOC. If possible, create an equivalent and
1127 less expensive sequence of statements prior to GSI, and return an
 1128    expression holding the result.  */
1129
1130 static tree
1131 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1132 tree arg0, tree arg1)
1133 {
1134 REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1135 REAL_VALUE_TYPE c2, dconst3;
1136 HOST_WIDE_INT n;
1137 tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1138 enum machine_mode mode;
1139 bool hw_sqrt_exists, c_is_int, c2_is_int;
1140
1141 /* If the exponent isn't a constant, there's nothing of interest
1142 to be done. */
1143 if (TREE_CODE (arg1) != REAL_CST)
1144 return NULL_TREE;
1145
1146 /* If the exponent is equivalent to an integer, expand to an optimal
1147 multiplication sequence when profitable. */
1148 c = TREE_REAL_CST (arg1);
1149 n = real_to_integer (&c);
1150 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1151 c_is_int = real_identical (&c, &cint);
1152
1153 if (c_is_int
1154 && ((n >= -1 && n <= 2)
1155 || (flag_unsafe_math_optimizations
1156 && optimize_insn_for_speed_p ()
1157 && powi_cost (n) <= POWI_MAX_MULTS)))
1158 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1159
1160 /* Attempt various optimizations using sqrt and cbrt. */
1161 type = TREE_TYPE (arg0);
1162 mode = TYPE_MODE (type);
1163 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1164
1165 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1166 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1167 sqrt(-0) = -0. */
1168 if (sqrtfn
1169 && REAL_VALUES_EQUAL (c, dconsthalf)
1170 && !HONOR_SIGNED_ZEROS (mode))
1171 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1172
1173 /* Optimize pow(x,0.25) = sqrt(sqrt(x)). Assume on most machines that
1174 a builtin sqrt instruction is smaller than a call to pow with 0.25,
1175 so do this optimization even if -Os. Don't do this optimization
1176 if we don't have a hardware sqrt insn. */
1177 dconst1_4 = dconst1;
1178 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1179 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1180
1181 if (flag_unsafe_math_optimizations
1182 && sqrtfn
1183 && REAL_VALUES_EQUAL (c, dconst1_4)
1184 && hw_sqrt_exists)
1185 {
1186 /* sqrt(x) */
1187 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1188
1189 /* sqrt(sqrt(x)) */
1190 return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1191 }
1192
1193 /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1194 optimizing for space. Don't do this optimization if we don't have
1195 a hardware sqrt insn. */
1196 real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1197 SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1198
1199 if (flag_unsafe_math_optimizations
1200 && sqrtfn
1201 && optimize_function_for_speed_p (cfun)
1202 && REAL_VALUES_EQUAL (c, dconst3_4)
1203 && hw_sqrt_exists)
1204 {
1205 /* sqrt(x) */
1206 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1207
1208 /* sqrt(sqrt(x)) */
1209 sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1210
1211 /* sqrt(x) * sqrt(sqrt(x)) */
1212 return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1213 sqrt_arg0, sqrt_sqrt);
1214 }
1215
1216 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1217 optimizations since 1./3. is not exactly representable. If x
1218 is negative and finite, the correct value of pow(x,1./3.) is
1219 a NaN with the "invalid" exception raised, because the value
1220 of 1./3. actually has an even denominator. The correct value
1221 of cbrt(x) is a negative real value. */
1222 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1223 dconst1_3 = real_value_truncate (mode, dconst_third ());
1224
1225 if (flag_unsafe_math_optimizations
1226 && cbrtfn
1227 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1228 && REAL_VALUES_EQUAL (c, dconst1_3))
1229 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1230
1231 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1232 if we don't have a hardware sqrt insn. */
1233 dconst1_6 = dconst1_3;
1234 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1235
1236 if (flag_unsafe_math_optimizations
1237 && sqrtfn
1238 && cbrtfn
1239 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1240 && optimize_function_for_speed_p (cfun)
1241 && hw_sqrt_exists
1242 && REAL_VALUES_EQUAL (c, dconst1_6))
1243 {
1244 /* sqrt(x) */
1245 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1246
1247 /* cbrt(sqrt(x)) */
1248 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1249 }
1250
1251 /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1252 and c not an integer, into
1253
1254 sqrt(x) * powi(x, n/2), n > 0;
1255 1.0 / (sqrt(x) * powi(x, abs(n/2))), n < 0.
1256
1257 Do not calculate the powi factor when n/2 = 0. */
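  /* For example, pow (x, 2.5) becomes sqrt (x) * powi (x, 2), and
     pow (x, -1.5) becomes 1.0 / (sqrt (x) * x).  */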
1258 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1259 n = real_to_integer (&c2);
1260 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1261 c2_is_int = real_identical (&c2, &cint);
1262
1263 if (flag_unsafe_math_optimizations
1264 && sqrtfn
1265 && c2_is_int
1266 && !c_is_int
1267 && optimize_function_for_speed_p (cfun))
1268 {
1269 tree powi_x_ndiv2 = NULL_TREE;
1270
1271 /* Attempt to fold powi(arg0, abs(n/2)) into multiplies. If not
1272 possible or profitable, give up. Skip the degenerate case when
1273 n is 1 or -1, where the result is always 1. */
1274 if (absu_hwi (n) != 1)
1275 {
1276 powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1277 abs_hwi (n / 2));
1278 if (!powi_x_ndiv2)
1279 return NULL_TREE;
1280 }
1281
1282 /* Calculate sqrt(x). When n is not 1 or -1, multiply it by the
1283 result of the optimal multiply sequence just calculated. */
1284 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1285
1286 if (absu_hwi (n) == 1)
1287 result = sqrt_arg0;
1288 else
1289 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1290 sqrt_arg0, powi_x_ndiv2);
1291
1292 /* If n is negative, reciprocate the result. */
1293 if (n < 0)
1294 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1295 build_real (type, dconst1), result);
1296 return result;
1297 }
1298
1299 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1300
1301 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1302 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1303
1304 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1305 different from pow(x, 1./3.) due to rounding and behavior with
1306 negative x, we need to constrain this transformation to unsafe
1307 math and positive x or finite math. */
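  /* For example, pow (x, 4./3.) becomes x * cbrt (x), and
     pow (x, 5./3.) becomes x * (cbrt (x) * cbrt (x)).  */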
1308 real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1309 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1310 real_round (&c2, mode, &c2);
1311 n = real_to_integer (&c2);
1312 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1313 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1314 real_convert (&c2, mode, &c2);
1315
1316 if (flag_unsafe_math_optimizations
1317 && cbrtfn
1318 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1319 && real_identical (&c2, &c)
1320 && !c2_is_int
1321 && optimize_function_for_speed_p (cfun)
1322 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1323 {
1324 tree powi_x_ndiv3 = NULL_TREE;
1325
1326 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1327 possible or profitable, give up. Skip the degenerate case when
1328 abs(n) < 3, where the result is always 1. */
1329 if (absu_hwi (n) >= 3)
1330 {
1331 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1332 abs_hwi (n / 3));
1333 if (!powi_x_ndiv3)
1334 return NULL_TREE;
1335 }
1336
1337 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1338 as that creates an unnecessary variable. Instead, just produce
1339 either cbrt(x) or cbrt(x) * cbrt(x). */
1340 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1341
1342 if (absu_hwi (n) % 3 == 1)
1343 powi_cbrt_x = cbrt_x;
1344 else
1345 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1346 cbrt_x, cbrt_x);
1347
1348 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1349 if (absu_hwi (n) < 3)
1350 result = powi_cbrt_x;
1351 else
1352 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1353 powi_x_ndiv3, powi_cbrt_x);
1354
1355 /* If n is negative, reciprocate the result. */
1356 if (n < 0)
1357 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1358 build_real (type, dconst1), result);
1359
1360 return result;
1361 }
1362
1363 /* No optimizations succeeded. */
1364 return NULL_TREE;
1365 }
1366
1367 /* ARG is the argument to a cabs builtin call in GSI with location info
1368 LOC. Create a sequence of statements prior to GSI that calculates
1369 sqrt(R*R + I*I), where R and I are the real and imaginary components
1370 of ARG, respectively. Return an expression holding the result. */
1371
1372 static tree
1373 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1374 {
1375 tree real_part, imag_part, addend1, addend2, sum, result;
1376 tree type = TREE_TYPE (TREE_TYPE (arg));
1377 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1378 enum machine_mode mode = TYPE_MODE (type);
1379
1380 if (!flag_unsafe_math_optimizations
1381 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1382 || !sqrtfn
1383 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1384 return NULL_TREE;
1385
1386 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1387 REALPART_EXPR, arg);
1388 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1389 real_part, real_part);
1390 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1391 IMAGPART_EXPR, arg);
1392 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1393 imag_part, imag_part);
1394 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1395 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1396
1397 return result;
1398 }
1399
1400 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1401 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1402 an optimal number of multiplies, when n is a constant. */
1403
1404 static unsigned int
1405 execute_cse_sincos (void)
1406 {
1407 basic_block bb;
1408 bool cfg_changed = false;
1409
1410 calculate_dominance_info (CDI_DOMINATORS);
1411 memset (&sincos_stats, 0, sizeof (sincos_stats));
1412
1413 FOR_EACH_BB (bb)
1414 {
1415 gimple_stmt_iterator gsi;
1416 bool cleanup_eh = false;
1417
1418 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1419 {
1420 gimple stmt = gsi_stmt (gsi);
1421 tree fndecl;
1422
1423 /* Only the last stmt in a bb could throw, no need to call
1424 gimple_purge_dead_eh_edges if we change something in the middle
1425 of a basic block. */
1426 cleanup_eh = false;
1427
1428 if (is_gimple_call (stmt)
1429 && gimple_call_lhs (stmt)
1430 && (fndecl = gimple_call_fndecl (stmt))
1431 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1432 {
1433 tree arg, arg0, arg1, result;
1434 HOST_WIDE_INT n;
1435 location_t loc;
1436
1437 switch (DECL_FUNCTION_CODE (fndecl))
1438 {
1439 CASE_FLT_FN (BUILT_IN_COS):
1440 CASE_FLT_FN (BUILT_IN_SIN):
1441 CASE_FLT_FN (BUILT_IN_CEXPI):
1442 /* Make sure we have either sincos or cexp. */
1443 if (!targetm.libc_has_function (function_c99_math_complex)
1444 && !targetm.libc_has_function (function_sincos))
1445 break;
1446
1447 arg = gimple_call_arg (stmt, 0);
1448 if (TREE_CODE (arg) == SSA_NAME)
1449 cfg_changed |= execute_cse_sincos_1 (arg);
1450 break;
1451
1452 CASE_FLT_FN (BUILT_IN_POW):
1453 arg0 = gimple_call_arg (stmt, 0);
1454 arg1 = gimple_call_arg (stmt, 1);
1455
1456 loc = gimple_location (stmt);
1457 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1458
1459 if (result)
1460 {
1461 tree lhs = gimple_get_lhs (stmt);
1462 gimple new_stmt = gimple_build_assign (lhs, result);
1463 gimple_set_location (new_stmt, loc);
1464 unlink_stmt_vdef (stmt);
1465 gsi_replace (&gsi, new_stmt, true);
1466 cleanup_eh = true;
1467 if (gimple_vdef (stmt))
1468 release_ssa_name (gimple_vdef (stmt));
1469 }
1470 break;
1471
1472 CASE_FLT_FN (BUILT_IN_POWI):
1473 arg0 = gimple_call_arg (stmt, 0);
1474 arg1 = gimple_call_arg (stmt, 1);
1475 loc = gimple_location (stmt);
1476
1477 if (real_minus_onep (arg0))
1478 {
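		  /* powi (-1, n) is 1 for even n and -1 for odd n, so select
		     between the two constants from the low bit of the
		     exponent instead of expanding any multiplications.  */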
1479 tree t0, t1, cond, one, minus_one;
1480 gimple stmt;
1481
1482 t0 = TREE_TYPE (arg0);
1483 t1 = TREE_TYPE (arg1);
1484 one = build_real (t0, dconst1);
1485 minus_one = build_real (t0, dconstm1);
1486
1487 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1488 stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
1489 arg1,
1490 build_int_cst (t1,
1491 1));
1492 gimple_set_location (stmt, loc);
1493 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1494
1495 result = make_temp_ssa_name (t0, NULL, "powi");
1496 stmt = gimple_build_assign_with_ops (COND_EXPR, result,
1497 cond,
1498 minus_one, one);
1499 gimple_set_location (stmt, loc);
1500 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1501 }
1502 else
1503 {
1504 if (!host_integerp (arg1, 0))
1505 break;
1506
1507 n = TREE_INT_CST_LOW (arg1);
1508 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1509 }
1510
1511 if (result)
1512 {
1513 tree lhs = gimple_get_lhs (stmt);
1514 gimple new_stmt = gimple_build_assign (lhs, result);
1515 gimple_set_location (new_stmt, loc);
1516 unlink_stmt_vdef (stmt);
1517 gsi_replace (&gsi, new_stmt, true);
1518 cleanup_eh = true;
1519 if (gimple_vdef (stmt))
1520 release_ssa_name (gimple_vdef (stmt));
1521 }
1522 break;
1523
1524 CASE_FLT_FN (BUILT_IN_CABS):
1525 arg0 = gimple_call_arg (stmt, 0);
1526 loc = gimple_location (stmt);
1527 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1528
1529 if (result)
1530 {
1531 tree lhs = gimple_get_lhs (stmt);
1532 gimple new_stmt = gimple_build_assign (lhs, result);
1533 gimple_set_location (new_stmt, loc);
1534 unlink_stmt_vdef (stmt);
1535 gsi_replace (&gsi, new_stmt, true);
1536 cleanup_eh = true;
1537 if (gimple_vdef (stmt))
1538 release_ssa_name (gimple_vdef (stmt));
1539 }
1540 break;
1541
1542 default:;
1543 }
1544 }
1545 }
1546 if (cleanup_eh)
1547 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1548 }
1549
1550 statistics_counter_event (cfun, "sincos statements inserted",
1551 sincos_stats.inserted);
1552
1553 free_dominance_info (CDI_DOMINATORS);
1554 return cfg_changed ? TODO_cleanup_cfg : 0;
1555 }
1556
1557 static bool
1558 gate_cse_sincos (void)
1559 {
1560 /* We no longer require either sincos or cexp, since powi expansion
1561 piggybacks on this pass. */
1562 return optimize;
1563 }
1564
1565 namespace {
1566
1567 const pass_data pass_data_cse_sincos =
1568 {
1569 GIMPLE_PASS, /* type */
1570 "sincos", /* name */
1571 OPTGROUP_NONE, /* optinfo_flags */
1572 true, /* has_gate */
1573 true, /* has_execute */
1574 TV_NONE, /* tv_id */
1575 PROP_ssa, /* properties_required */
1576 0, /* properties_provided */
1577 0, /* properties_destroyed */
1578 0, /* todo_flags_start */
1579 ( TODO_update_ssa | TODO_verify_ssa
1580 | TODO_verify_stmts ), /* todo_flags_finish */
1581 };
1582
1583 class pass_cse_sincos : public gimple_opt_pass
1584 {
1585 public:
1586 pass_cse_sincos (gcc::context *ctxt)
1587 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1588 {}
1589
1590 /* opt_pass methods: */
1591 bool gate () { return gate_cse_sincos (); }
1592 unsigned int execute () { return execute_cse_sincos (); }
1593
1594 }; // class pass_cse_sincos
1595
1596 } // anon namespace
1597
1598 gimple_opt_pass *
1599 make_pass_cse_sincos (gcc::context *ctxt)
1600 {
1601 return new pass_cse_sincos (ctxt);
1602 }
1603
1604 /* A symbolic number is used to detect byte permutation and selection
1605 patterns. Therefore the field N contains an artificial number
1606 consisting of byte size markers:
1607
1608 0 - byte has the value 0
1609 1..size - byte contains the content of the byte
1610 number indexed with that value minus one */
1611
1612 struct symbolic_number {
1613 unsigned HOST_WIDEST_INT n;
1614 int size;
1615 };
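/* As an illustration: for a 32-bit operand the leaf value of N set up by
   find_bswap_1 below is 0x04030201.  In a hand-written swap such as

     (x << 24) | ((x << 8) & 0xff0000) | ((x >> 8) & 0xff00) | (x >> 24)

   the four shifted and masked copies carry the symbolic values 0x01000000,
   0x00020000, 0x00000300 and 0x00000004; OR-ing them gives 0x01020304, the
   fully reversed pattern that find_bswap looks for.  */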
1616
1617 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1618 number N. Return false if the requested operation is not permitted
1619 on a symbolic number. */
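/* For instance, an LROTATE_EXPR by 8 on the 32-bit value 0x04030201 yields
   0x03020104: every byte marker moves up one position and the topmost
   marker wraps around to the bottom.  */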
1620
1621 static inline bool
1622 do_shift_rotate (enum tree_code code,
1623 struct symbolic_number *n,
1624 int count)
1625 {
1626 if (count % 8 != 0)
1627 return false;
1628
1629 /* Zero out the extra bits of N in order to avoid them being shifted
1630 into the significant bits. */
1631 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1632 n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1633
1634 switch (code)
1635 {
1636 case LSHIFT_EXPR:
1637 n->n <<= count;
1638 break;
1639 case RSHIFT_EXPR:
1640 n->n >>= count;
1641 break;
1642 case LROTATE_EXPR:
1643 n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1644 break;
1645 case RROTATE_EXPR:
1646 n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1647 break;
1648 default:
1649 return false;
1650 }
1651 /* Zero unused bits for size. */
1652 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1653 n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1654 return true;
1655 }
1656
1657 /* Perform sanity checking for the symbolic number N and the gimple
1658 statement STMT. */
1659
1660 static inline bool
1661 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1662 {
1663 tree lhs_type;
1664
1665 lhs_type = gimple_expr_type (stmt);
1666
1667 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1668 return false;
1669
1670 if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1671 return false;
1672
1673 return true;
1674 }
1675
1676 /* find_bswap_1 invokes itself recursively with N and tries to perform
1677 the operation given by the rhs of STMT on the result. If the
 1678    operation could be executed successfully, the function returns the
1679 tree expression of the source operand and NULL otherwise. */
1680
1681 static tree
1682 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1683 {
1684 enum tree_code code;
1685 tree rhs1, rhs2 = NULL;
1686 gimple rhs1_stmt, rhs2_stmt;
1687 tree source_expr1;
1688 enum gimple_rhs_class rhs_class;
1689
1690 if (!limit || !is_gimple_assign (stmt))
1691 return NULL_TREE;
1692
1693 rhs1 = gimple_assign_rhs1 (stmt);
1694
1695 if (TREE_CODE (rhs1) != SSA_NAME)
1696 return NULL_TREE;
1697
1698 code = gimple_assign_rhs_code (stmt);
1699 rhs_class = gimple_assign_rhs_class (stmt);
1700 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1701
1702 if (rhs_class == GIMPLE_BINARY_RHS)
1703 rhs2 = gimple_assign_rhs2 (stmt);
1704
1705 /* Handle unary rhs and binary rhs with integer constants as second
1706 operand. */
1707
1708 if (rhs_class == GIMPLE_UNARY_RHS
1709 || (rhs_class == GIMPLE_BINARY_RHS
1710 && TREE_CODE (rhs2) == INTEGER_CST))
1711 {
1712 if (code != BIT_AND_EXPR
1713 && code != LSHIFT_EXPR
1714 && code != RSHIFT_EXPR
1715 && code != LROTATE_EXPR
1716 && code != RROTATE_EXPR
1717 && code != NOP_EXPR
1718 && code != CONVERT_EXPR)
1719 return NULL_TREE;
1720
1721 source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1722
 1723      /* If find_bswap_1 returned NULL, STMT is a leaf node and we have
1724 to initialize the symbolic number. */
1725 if (!source_expr1)
1726 {
1727 /* Set up the symbolic number N by setting each byte to a
1728 value between 1 and the byte size of rhs1. The highest
1729 order byte is set to n->size and the lowest order
1730 byte to 1. */
1731 n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1732 if (n->size % BITS_PER_UNIT != 0)
1733 return NULL_TREE;
1734 n->size /= BITS_PER_UNIT;
1735 n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1736 (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1737
1738 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1739 n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1740 (n->size * BITS_PER_UNIT)) - 1;
1741
1742 source_expr1 = rhs1;
1743 }
1744
1745 switch (code)
1746 {
1747 case BIT_AND_EXPR:
1748 {
1749 int i;
1750 unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1751 unsigned HOST_WIDEST_INT tmp = val;
1752
1753 /* Only constants masking full bytes are allowed. */
1754 for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1755 if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1756 return NULL_TREE;
1757
1758 n->n &= val;
1759 }
1760 break;
1761 case LSHIFT_EXPR:
1762 case RSHIFT_EXPR:
1763 case LROTATE_EXPR:
1764 case RROTATE_EXPR:
1765 if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1766 return NULL_TREE;
1767 break;
1768 CASE_CONVERT:
1769 {
1770 int type_size;
1771
1772 type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1773 if (type_size % BITS_PER_UNIT != 0)
1774 return NULL_TREE;
1775
1776 if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1777 {
1778 /* If STMT casts to a smaller type mask out the bits not
1779 belonging to the target type. */
1780 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1781 }
1782 n->size = type_size / BITS_PER_UNIT;
1783 }
1784 break;
1785 default:
1786 return NULL_TREE;
1787 };
1788 return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1789 }
1790
1791 /* Handle binary rhs. */
1792
1793 if (rhs_class == GIMPLE_BINARY_RHS)
1794 {
1795 struct symbolic_number n1, n2;
1796 tree source_expr2;
1797
1798 if (code != BIT_IOR_EXPR)
1799 return NULL_TREE;
1800
1801 if (TREE_CODE (rhs2) != SSA_NAME)
1802 return NULL_TREE;
1803
1804 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1805
1806 switch (code)
1807 {
1808 case BIT_IOR_EXPR:
1809 source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1810
1811 if (!source_expr1)
1812 return NULL_TREE;
1813
1814 source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1815
1816 if (source_expr1 != source_expr2
1817 || n1.size != n2.size)
1818 return NULL_TREE;
1819
1820 n->size = n1.size;
1821 n->n = n1.n | n2.n;
1822
1823 if (!verify_symbolic_number_p (n, stmt))
1824 return NULL_TREE;
1825
1826 break;
1827 default:
1828 return NULL_TREE;
1829 }
1830 return source_expr1;
1831 }
1832 return NULL_TREE;
1833 }
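
/* A minimal host-side sketch (hypothetical helper, not part of the pass)
   of the symbolic-number bookkeeping above: each byte of N names one
   source byte, 1 being the least significant, so the 16-bit leaf value is
   0x0201.  Shifting the source value permutes those markers in the same
   way, which is why comparing the final marker word against a fixed
   constant is enough to recognize a byte swap.  */

static unsigned HOST_WIDEST_INT
symbolic_number_sketch (void)
{
  unsigned HOST_WIDEST_INT lo, hi;

  lo = (0x0201 << 8) & 0xffff;	/* models "x << 8": marker word 0x0100 */
  hi = 0x0201 >> 8;		/* models "x >> 8": marker word 0x0002 */
  return lo | hi;		/* 0x0102, the bswap pattern for 2 bytes */
}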
1834
1835 /* Check if STMT completes a bswap implementation consisting of ORs,
1836 SHIFTs and ANDs. Return the source tree expression on which the
1837    byte swap is performed, or NULL if no bswap was found. */
1838
1839 static tree
1840 find_bswap (gimple stmt)
1841 {
1842 /* The number which the find_bswap result should match in order to
1843      have a full byte swap.  The number is shifted to the right according
1844 to the size of the symbolic number before using it. */
1845 unsigned HOST_WIDEST_INT cmp =
1846 sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1847 (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1848
1849 struct symbolic_number n;
1850 tree source_expr;
1851 int limit;
1852
1853   /* The last parameter determines the search depth limit.  It usually
1854      correlates directly to the number of bytes to be touched.  We
1855      increase that number here in order to also
1856      cover signed -> unsigned conversions of the src operand as can be seen
1857      in libgcc, and for the initial shift/and operation of the src operand. */
1858 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1859 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1860 source_expr = find_bswap_1 (stmt, &n, limit);
1861
1862 if (!source_expr)
1863 return NULL_TREE;
1864
1865 /* Zero out the extra bits of N and CMP. */
1866 if (n.size < (int)sizeof (HOST_WIDEST_INT))
1867 {
1868 unsigned HOST_WIDEST_INT mask =
1869 ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1870
1871 n.n &= mask;
1872 cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1873 }
1874
1875   /* A complete byte swap should make the symbolic number start
1876 with the largest digit in the highest order byte. */
1877 if (cmp != n.n)
1878 return NULL_TREE;
1879
1880 return source_expr;
1881 }
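
/* For illustration only (a hypothetical user-level function, not part of
   this file): a source-level pattern the machinery above is meant to
   recognize.  Assuming a 32-bit unsigned int, find_bswap reduces the ORs,
   SHIFTs and ANDs below to the symbolic number 0x01020304 and returns X,
   so the pass can replace the whole sequence with __builtin_bswap32 (x).  */

static inline unsigned int
manual_bswap32 (unsigned int x)
{
  return ((x & 0x000000ffu) << 24)
	 | ((x & 0x0000ff00u) << 8)
	 | ((x & 0x00ff0000u) >> 8)
	 | ((x & 0xff000000u) >> 24);
}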
1882
1883 /* Find manual byte swap implementations and turn them into a bswap
1884    builtin invocation. */
1885
1886 static unsigned int
1887 execute_optimize_bswap (void)
1888 {
1889 basic_block bb;
1890 bool bswap16_p, bswap32_p, bswap64_p;
1891 bool changed = false;
1892 tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1893
1894 if (BITS_PER_UNIT != 8)
1895 return 0;
1896
1897 if (sizeof (HOST_WIDEST_INT) < 8)
1898 return 0;
1899
1900 bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
1901 && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
1902 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1903 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1904 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1905 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1906 || (bswap32_p && word_mode == SImode)));
1907
1908 if (!bswap16_p && !bswap32_p && !bswap64_p)
1909 return 0;
1910
1911 /* Determine the argument type of the builtins. The code later on
1912      assumes that the return and argument types are the same. */
1913 if (bswap16_p)
1914 {
1915 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1916 bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1917 }
1918
1919 if (bswap32_p)
1920 {
1921 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1922 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1923 }
1924
1925 if (bswap64_p)
1926 {
1927 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1928 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1929 }
1930
1931 memset (&bswap_stats, 0, sizeof (bswap_stats));
1932
1933 FOR_EACH_BB (bb)
1934 {
1935 gimple_stmt_iterator gsi;
1936
1937 /* We do a reverse scan for bswap patterns to make sure we get the
1938 widest match. As bswap pattern matching doesn't handle
1939 previously inserted smaller bswap replacements as sub-
1940 patterns, the wider variant wouldn't be detected. */
1941 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1942 {
1943 gimple stmt = gsi_stmt (gsi);
1944 tree bswap_src, bswap_type;
1945 tree bswap_tmp;
1946 tree fndecl = NULL_TREE;
1947 int type_size;
1948 gimple call;
1949
1950 if (!is_gimple_assign (stmt)
1951 || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1952 continue;
1953
1954 type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1955
1956 switch (type_size)
1957 {
1958 case 16:
1959 if (bswap16_p)
1960 {
1961 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1962 bswap_type = bswap16_type;
1963 }
1964 break;
1965 case 32:
1966 if (bswap32_p)
1967 {
1968 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1969 bswap_type = bswap32_type;
1970 }
1971 break;
1972 case 64:
1973 if (bswap64_p)
1974 {
1975 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1976 bswap_type = bswap64_type;
1977 }
1978 break;
1979 default:
1980 continue;
1981 }
1982
1983 if (!fndecl)
1984 continue;
1985
1986 bswap_src = find_bswap (stmt);
1987
1988 if (!bswap_src)
1989 continue;
1990
1991 changed = true;
1992 if (type_size == 16)
1993 bswap_stats.found_16bit++;
1994 else if (type_size == 32)
1995 bswap_stats.found_32bit++;
1996 else
1997 bswap_stats.found_64bit++;
1998
1999 bswap_tmp = bswap_src;
2000
2001 /* Convert the src expression if necessary. */
2002 if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2003 {
2004 gimple convert_stmt;
2005 bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2006 convert_stmt = gimple_build_assign_with_ops
2007 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
2008 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2009 }
2010
2011 call = gimple_build_call (fndecl, 1, bswap_tmp);
2012
2013 bswap_tmp = gimple_assign_lhs (stmt);
2014
2015 /* Convert the result if necessary. */
2016 if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2017 {
2018 gimple convert_stmt;
2019 bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2020 convert_stmt = gimple_build_assign_with_ops
2021 (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
2022 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2023 }
2024
2025 gimple_call_set_lhs (call, bswap_tmp);
2026
2027 if (dump_file)
2028 {
2029 fprintf (dump_file, "%d bit bswap implementation found at: ",
2030 (int)type_size);
2031 print_gimple_stmt (dump_file, stmt, 0, 0);
2032 }
2033
2034 gsi_insert_after (&gsi, call, GSI_SAME_STMT);
2035 gsi_remove (&gsi, true);
2036 }
2037 }
2038
2039 statistics_counter_event (cfun, "16-bit bswap implementations found",
2040 bswap_stats.found_16bit);
2041 statistics_counter_event (cfun, "32-bit bswap implementations found",
2042 bswap_stats.found_32bit);
2043 statistics_counter_event (cfun, "64-bit bswap implementations found",
2044 bswap_stats.found_64bit);
2045
2046 return (changed ? TODO_update_ssa | TODO_verify_ssa
2047 | TODO_verify_stmts : 0);
2048 }
2049
2050 static bool
2051 gate_optimize_bswap (void)
2052 {
2053 return flag_expensive_optimizations && optimize;
2054 }
2055
2056 namespace {
2057
2058 const pass_data pass_data_optimize_bswap =
2059 {
2060 GIMPLE_PASS, /* type */
2061 "bswap", /* name */
2062 OPTGROUP_NONE, /* optinfo_flags */
2063 true, /* has_gate */
2064 true, /* has_execute */
2065 TV_NONE, /* tv_id */
2066 PROP_ssa, /* properties_required */
2067 0, /* properties_provided */
2068 0, /* properties_destroyed */
2069 0, /* todo_flags_start */
2070 0, /* todo_flags_finish */
2071 };
2072
2073 class pass_optimize_bswap : public gimple_opt_pass
2074 {
2075 public:
2076 pass_optimize_bswap (gcc::context *ctxt)
2077 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2078 {}
2079
2080 /* opt_pass methods: */
2081 bool gate () { return gate_optimize_bswap (); }
2082 unsigned int execute () { return execute_optimize_bswap (); }
2083
2084 }; // class pass_optimize_bswap
2085
2086 } // anon namespace
2087
2088 gimple_opt_pass *
2089 make_pass_optimize_bswap (gcc::context *ctxt)
2090 {
2091 return new pass_optimize_bswap (ctxt);
2092 }
2093
2094 /* Return true if STMT is a type conversion operation that can be stripped
2095 when used in a widening multiply operation. */
2096 static bool
2097 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2098 {
2099 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2100
2101 if (TREE_CODE (result_type) == INTEGER_TYPE)
2102 {
2103 tree op_type;
2104 tree inner_op_type;
2105
2106 if (!CONVERT_EXPR_CODE_P (rhs_code))
2107 return false;
2108
2109 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2110
2111 /* If the type of OP has the same precision as the result, then
2112 we can strip this conversion. The multiply operation will be
2113 selected to create the correct extension as a by-product. */
2114 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2115 return true;
2116
2117 /* We can also strip a conversion if it preserves the signed-ness of
2118 the operation and doesn't narrow the range. */
2119 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2120
2121 /* If the inner-most type is unsigned, then we can strip any
2122 intermediate widening operation. If it's signed, then the
2123 intermediate widening operation must also be signed. */
2124 if ((TYPE_UNSIGNED (inner_op_type)
2125 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2126 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2127 return true;
2128
2129 return false;
2130 }
2131
2132 return rhs_code == FIXED_CONVERT_EXPR;
2133 }
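
/* A hedged source-level illustration of the strippable-conversion case
   (assuming 32-bit int and 64-bit long long): the casts widen each operand
   to the result precision, so they satisfy the precision check above and
   can be stripped, letting the MULT_EXPR become a WIDEN_MULT_EXPR of the
   two int operands.  */

static long long
widening_mult_example (int a, int b)
{
  return (long long) a * (long long) b;
}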
2134
2135 /* Return true if RHS is a suitable operand for a widening multiplication,
2136 assuming a target type of TYPE.
2137 There are two cases:
2138
2139 - RHS makes some value at least twice as wide. Store that value
2140 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2141
2142 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2143 but leave *TYPE_OUT untouched. */
2144
2145 static bool
2146 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2147 tree *new_rhs_out)
2148 {
2149 gimple stmt;
2150 tree type1, rhs1;
2151
2152 if (TREE_CODE (rhs) == SSA_NAME)
2153 {
2154 stmt = SSA_NAME_DEF_STMT (rhs);
2155 if (is_gimple_assign (stmt))
2156 {
2157 if (! widening_mult_conversion_strippable_p (type, stmt))
2158 rhs1 = rhs;
2159 else
2160 {
2161 rhs1 = gimple_assign_rhs1 (stmt);
2162
2163 if (TREE_CODE (rhs1) == INTEGER_CST)
2164 {
2165 *new_rhs_out = rhs1;
2166 *type_out = NULL;
2167 return true;
2168 }
2169 }
2170 }
2171 else
2172 rhs1 = rhs;
2173
2174 type1 = TREE_TYPE (rhs1);
2175
2176 if (TREE_CODE (type1) != TREE_CODE (type)
2177 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2178 return false;
2179
2180 *new_rhs_out = rhs1;
2181 *type_out = type1;
2182 return true;
2183 }
2184
2185 if (TREE_CODE (rhs) == INTEGER_CST)
2186 {
2187 *new_rhs_out = rhs;
2188 *type_out = NULL;
2189 return true;
2190 }
2191
2192 return false;
2193 }
2194
2195 /* Return true if STMT performs a widening multiplication, assuming the
2196 output type is TYPE. If so, store the unwidened types of the operands
2197 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2198 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2199 and *TYPE2_OUT would give the operands of the multiplication. */
2200
2201 static bool
2202 is_widening_mult_p (gimple stmt,
2203 tree *type1_out, tree *rhs1_out,
2204 tree *type2_out, tree *rhs2_out)
2205 {
2206 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2207
2208 if (TREE_CODE (type) != INTEGER_TYPE
2209 && TREE_CODE (type) != FIXED_POINT_TYPE)
2210 return false;
2211
2212 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2213 rhs1_out))
2214 return false;
2215
2216 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2217 rhs2_out))
2218 return false;
2219
2220 if (*type1_out == NULL)
2221 {
2222 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2223 return false;
2224 *type1_out = *type2_out;
2225 }
2226
2227 if (*type2_out == NULL)
2228 {
2229 if (!int_fits_type_p (*rhs2_out, *type1_out))
2230 return false;
2231 *type2_out = *type1_out;
2232 }
2233
2234 /* Ensure that the larger of the two operands comes first. */
2235 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2236 {
2237 tree tmp;
2238 tmp = *type1_out;
2239 *type1_out = *type2_out;
2240 *type2_out = tmp;
2241 tmp = *rhs1_out;
2242 *rhs1_out = *rhs2_out;
2243 *rhs2_out = tmp;
2244 }
2245
2246 return true;
2247 }
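
/* Sketch of the constant-operand case handled above (same assumptions as
   the previous example): the second operand is an INTEGER_CST, so
   is_widening_mult_rhs_p leaves *TYPE2_OUT NULL and it is filled in from
   *TYPE1_OUT here, provided the constant fits that type.  */

static long long
widening_mult_by_const_example (int a)
{
  return (long long) a * 1000;
}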
2248
2249 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2250 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2251 value is true iff we converted the statement. */
2252
2253 static bool
2254 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2255 {
2256 tree lhs, rhs1, rhs2, type, type1, type2;
2257 enum insn_code handler;
2258 enum machine_mode to_mode, from_mode, actual_mode;
2259 optab op;
2260 int actual_precision;
2261 location_t loc = gimple_location (stmt);
2262 bool from_unsigned1, from_unsigned2;
2263
2264 lhs = gimple_assign_lhs (stmt);
2265 type = TREE_TYPE (lhs);
2266 if (TREE_CODE (type) != INTEGER_TYPE)
2267 return false;
2268
2269 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2270 return false;
2271
2272 to_mode = TYPE_MODE (type);
2273 from_mode = TYPE_MODE (type1);
2274 from_unsigned1 = TYPE_UNSIGNED (type1);
2275 from_unsigned2 = TYPE_UNSIGNED (type2);
2276
2277 if (from_unsigned1 && from_unsigned2)
2278 op = umul_widen_optab;
2279 else if (!from_unsigned1 && !from_unsigned2)
2280 op = smul_widen_optab;
2281 else
2282 op = usmul_widen_optab;
2283
2284 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2285 0, &actual_mode);
2286
2287 if (handler == CODE_FOR_nothing)
2288 {
2289 if (op != smul_widen_optab)
2290 {
2291 /* We can use a signed multiply with unsigned types as long as
2292 there is a wider mode to use, or it is the smaller of the two
2293 types that is unsigned. Note that type1 >= type2, always. */
2294 if ((TYPE_UNSIGNED (type1)
2295 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2296 || (TYPE_UNSIGNED (type2)
2297 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2298 {
2299 from_mode = GET_MODE_WIDER_MODE (from_mode);
2300 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2301 return false;
2302 }
2303
2304 op = smul_widen_optab;
2305 handler = find_widening_optab_handler_and_mode (op, to_mode,
2306 from_mode, 0,
2307 &actual_mode);
2308
2309 if (handler == CODE_FOR_nothing)
2310 return false;
2311
2312 from_unsigned1 = from_unsigned2 = false;
2313 }
2314 else
2315 return false;
2316 }
2317
2318   /* Ensure that the inputs to the handler are in the correct precision
2319 for the opcode. This will be the full mode size. */
2320 actual_precision = GET_MODE_PRECISION (actual_mode);
2321 if (2 * actual_precision > TYPE_PRECISION (type))
2322 return false;
2323 if (actual_precision != TYPE_PRECISION (type1)
2324 || from_unsigned1 != TYPE_UNSIGNED (type1))
2325 rhs1 = build_and_insert_cast (gsi, loc,
2326 build_nonstandard_integer_type
2327 (actual_precision, from_unsigned1), rhs1);
2328 if (actual_precision != TYPE_PRECISION (type2)
2329 || from_unsigned2 != TYPE_UNSIGNED (type2))
2330 rhs2 = build_and_insert_cast (gsi, loc,
2331 build_nonstandard_integer_type
2332 (actual_precision, from_unsigned2), rhs2);
2333
2334 /* Handle constants. */
2335 if (TREE_CODE (rhs1) == INTEGER_CST)
2336 rhs1 = fold_convert (type1, rhs1);
2337 if (TREE_CODE (rhs2) == INTEGER_CST)
2338 rhs2 = fold_convert (type2, rhs2);
2339
2340 gimple_assign_set_rhs1 (stmt, rhs1);
2341 gimple_assign_set_rhs2 (stmt, rhs2);
2342 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2343 update_stmt (stmt);
2344 widen_mul_stats.widen_mults_inserted++;
2345 return true;
2346 }
2347
2348 /* Process a single gimple statement STMT, which is found at the
2349    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2350 rhs (given by CODE), and try to convert it into a
2351 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2352 is true iff we converted the statement. */
2353
2354 static bool
2355 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2356 enum tree_code code)
2357 {
2358 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2359 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2360 tree type, type1, type2, optype;
2361 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2362 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2363 optab this_optab;
2364 enum tree_code wmult_code;
2365 enum insn_code handler;
2366 enum machine_mode to_mode, from_mode, actual_mode;
2367 location_t loc = gimple_location (stmt);
2368 int actual_precision;
2369 bool from_unsigned1, from_unsigned2;
2370
2371 lhs = gimple_assign_lhs (stmt);
2372 type = TREE_TYPE (lhs);
2373 if (TREE_CODE (type) != INTEGER_TYPE
2374 && TREE_CODE (type) != FIXED_POINT_TYPE)
2375 return false;
2376
2377 if (code == MINUS_EXPR)
2378 wmult_code = WIDEN_MULT_MINUS_EXPR;
2379 else
2380 wmult_code = WIDEN_MULT_PLUS_EXPR;
2381
2382 rhs1 = gimple_assign_rhs1 (stmt);
2383 rhs2 = gimple_assign_rhs2 (stmt);
2384
2385 if (TREE_CODE (rhs1) == SSA_NAME)
2386 {
2387 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2388 if (is_gimple_assign (rhs1_stmt))
2389 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2390 }
2391
2392 if (TREE_CODE (rhs2) == SSA_NAME)
2393 {
2394 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2395 if (is_gimple_assign (rhs2_stmt))
2396 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2397 }
2398
2399 /* Allow for one conversion statement between the multiply
2400      and addition/subtraction statement.  If there is more than
2401      one conversion then we assume they would invalidate this
2402 transformation. If that's not the case then they should have
2403 been folded before now. */
2404 if (CONVERT_EXPR_CODE_P (rhs1_code))
2405 {
2406 conv1_stmt = rhs1_stmt;
2407 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2408 if (TREE_CODE (rhs1) == SSA_NAME)
2409 {
2410 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2411 if (is_gimple_assign (rhs1_stmt))
2412 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2413 }
2414 else
2415 return false;
2416 }
2417 if (CONVERT_EXPR_CODE_P (rhs2_code))
2418 {
2419 conv2_stmt = rhs2_stmt;
2420 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2421 if (TREE_CODE (rhs2) == SSA_NAME)
2422 {
2423 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2424 if (is_gimple_assign (rhs2_stmt))
2425 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2426 }
2427 else
2428 return false;
2429 }
2430
2431 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2432      is_widening_mult_p, but we still need the returned rhs values.
2433
2434 It might also appear that it would be sufficient to use the existing
2435 operands of the widening multiply, but that would limit the choice of
2436 multiply-and-accumulate instructions.
2437
2438      If the widened-multiplication result has more than one use, it is
2439 probably wiser not to do the conversion. */
2440 if (code == PLUS_EXPR
2441 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2442 {
2443 if (!has_single_use (rhs1)
2444 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2445 &type2, &mult_rhs2))
2446 return false;
2447 add_rhs = rhs2;
2448 conv_stmt = conv1_stmt;
2449 }
2450 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2451 {
2452 if (!has_single_use (rhs2)
2453 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2454 &type2, &mult_rhs2))
2455 return false;
2456 add_rhs = rhs1;
2457 conv_stmt = conv2_stmt;
2458 }
2459 else
2460 return false;
2461
2462 to_mode = TYPE_MODE (type);
2463 from_mode = TYPE_MODE (type1);
2464 from_unsigned1 = TYPE_UNSIGNED (type1);
2465 from_unsigned2 = TYPE_UNSIGNED (type2);
2466 optype = type1;
2467
2468 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2469 if (from_unsigned1 != from_unsigned2)
2470 {
2471 if (!INTEGRAL_TYPE_P (type))
2472 return false;
2473 /* We can use a signed multiply with unsigned types as long as
2474 there is a wider mode to use, or it is the smaller of the two
2475 types that is unsigned. Note that type1 >= type2, always. */
2476 if ((from_unsigned1
2477 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2478 || (from_unsigned2
2479 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2480 {
2481 from_mode = GET_MODE_WIDER_MODE (from_mode);
2482 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2483 return false;
2484 }
2485
2486 from_unsigned1 = from_unsigned2 = false;
2487 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2488 false);
2489 }
2490
2491 /* If there was a conversion between the multiply and addition
2492 then we need to make sure it fits a multiply-and-accumulate.
2493      There should be a single mode change which does not change the
2494 value. */
2495 if (conv_stmt)
2496 {
2497 /* We use the original, unmodified data types for this. */
2498 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2499 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2500 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2501 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2502
2503 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2504 {
2505 /* Conversion is a truncate. */
2506 if (TYPE_PRECISION (to_type) < data_size)
2507 return false;
2508 }
2509 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2510 {
2511 /* Conversion is an extend. Check it's the right sort. */
2512 if (TYPE_UNSIGNED (from_type) != is_unsigned
2513 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2514 return false;
2515 }
2516 /* else convert is a no-op for our purposes. */
2517 }
2518
2519 /* Verify that the machine can perform a widening multiply
2520 accumulate in this mode/signedness combination, otherwise
2521 this transformation is likely to pessimize code. */
2522 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2523 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2524 from_mode, 0, &actual_mode);
2525
2526 if (handler == CODE_FOR_nothing)
2527 return false;
2528
2529   /* Ensure that the inputs to the handler are in the correct precision
2530 for the opcode. This will be the full mode size. */
2531 actual_precision = GET_MODE_PRECISION (actual_mode);
2532 if (actual_precision != TYPE_PRECISION (type1)
2533 || from_unsigned1 != TYPE_UNSIGNED (type1))
2534 mult_rhs1 = build_and_insert_cast (gsi, loc,
2535 build_nonstandard_integer_type
2536 (actual_precision, from_unsigned1),
2537 mult_rhs1);
2538 if (actual_precision != TYPE_PRECISION (type2)
2539 || from_unsigned2 != TYPE_UNSIGNED (type2))
2540 mult_rhs2 = build_and_insert_cast (gsi, loc,
2541 build_nonstandard_integer_type
2542 (actual_precision, from_unsigned2),
2543 mult_rhs2);
2544
2545 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2546 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2547
2548 /* Handle constants. */
2549 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2550 mult_rhs1 = fold_convert (type1, mult_rhs1);
2551 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2552 mult_rhs2 = fold_convert (type2, mult_rhs2);
2553
2554 gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2555 add_rhs);
2556 update_stmt (gsi_stmt (*gsi));
2557 widen_mul_stats.maccs_inserted++;
2558 return true;
2559 }
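
/* Source-level sketch of the pattern convert_plusminus_to_widen looks for
   (hypothetical example; it only pays off when the target provides a
   widening multiply-accumulate insn, i.e. the optab handler above is not
   CODE_FOR_nothing): the widening multiply feeding the PLUS_EXPR is fused
   with the addition into a single WIDEN_MULT_PLUS_EXPR.  */

static long long
widening_macc_example (long long acc, int a, int b)
{
  return acc + (long long) a * (long long) b;
}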
2560
2561 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2562 with uses in additions and subtractions to form fused multiply-add
2563 operations. Returns true if successful and MUL_STMT should be removed. */
2564
2565 static bool
2566 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2567 {
2568 tree mul_result = gimple_get_lhs (mul_stmt);
2569 tree type = TREE_TYPE (mul_result);
2570 gimple use_stmt, neguse_stmt, fma_stmt;
2571 use_operand_p use_p;
2572 imm_use_iterator imm_iter;
2573
2574 if (FLOAT_TYPE_P (type)
2575 && flag_fp_contract_mode == FP_CONTRACT_OFF)
2576 return false;
2577
2578 /* We don't want to do bitfield reduction ops. */
2579 if (INTEGRAL_TYPE_P (type)
2580 && (TYPE_PRECISION (type)
2581 != GET_MODE_PRECISION (TYPE_MODE (type))))
2582 return false;
2583
2584 /* If the target doesn't support it, don't generate it. We assume that
2585 if fma isn't available then fms, fnma or fnms are not either. */
2586 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2587 return false;
2588
2589 /* If the multiplication has zero uses, it is kept around probably because
2590 of -fnon-call-exceptions. Don't optimize it away in that case,
2591      it is DCE's job. */
2592 if (has_zero_uses (mul_result))
2593 return false;
2594
2595 /* Make sure that the multiplication statement becomes dead after
2596      the transformation, i.e. that all uses are transformed to FMAs.
2597 This means we assume that an FMA operation has the same cost
2598 as an addition. */
2599 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2600 {
2601 enum tree_code use_code;
2602 tree result = mul_result;
2603 bool negate_p = false;
2604
2605 use_stmt = USE_STMT (use_p);
2606
2607 if (is_gimple_debug (use_stmt))
2608 continue;
2609
2610       /* For now restrict this operation to single basic blocks.  In theory
2611 we would want to support sinking the multiplication in
2612 m = a*b;
2613 if ()
2614 ma = m + c;
2615 else
2616 d = m;
2617 to form a fma in the then block and sink the multiplication to the
2618 else block. */
2619 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2620 return false;
2621
2622 if (!is_gimple_assign (use_stmt))
2623 return false;
2624
2625 use_code = gimple_assign_rhs_code (use_stmt);
2626
2627 /* A negate on the multiplication leads to FNMA. */
2628 if (use_code == NEGATE_EXPR)
2629 {
2630 ssa_op_iter iter;
2631 use_operand_p usep;
2632
2633 result = gimple_assign_lhs (use_stmt);
2634
2635 /* Make sure the negate statement becomes dead with this
2636 single transformation. */
2637 if (!single_imm_use (gimple_assign_lhs (use_stmt),
2638 &use_p, &neguse_stmt))
2639 return false;
2640
2641 /* Make sure the multiplication isn't also used on that stmt. */
2642 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2643 if (USE_FROM_PTR (usep) == mul_result)
2644 return false;
2645
2646 /* Re-validate. */
2647 use_stmt = neguse_stmt;
2648 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2649 return false;
2650 if (!is_gimple_assign (use_stmt))
2651 return false;
2652
2653 use_code = gimple_assign_rhs_code (use_stmt);
2654 negate_p = true;
2655 }
2656
2657 switch (use_code)
2658 {
2659 case MINUS_EXPR:
2660 if (gimple_assign_rhs2 (use_stmt) == result)
2661 negate_p = !negate_p;
2662 break;
2663 case PLUS_EXPR:
2664 break;
2665 default:
2666 /* FMA can only be formed from PLUS and MINUS. */
2667 return false;
2668 }
2669
2670 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
2671 by a MULT_EXPR that we'll visit later, we might be able to
2672 get a more profitable match with fnma.
2673 OTOH, if we don't, a negate / fma pair has likely lower latency
2674 	 than a mult / subtract pair. */
2675 if (use_code == MINUS_EXPR && !negate_p
2676 && gimple_assign_rhs1 (use_stmt) == result
2677 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
2678 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
2679 {
2680 tree rhs2 = gimple_assign_rhs2 (use_stmt);
2681
2682 if (TREE_CODE (rhs2) == SSA_NAME)
2683 {
2684 gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
2685 if (has_single_use (rhs2)
2686 && is_gimple_assign (stmt2)
2687 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
2688 return false;
2689 }
2690 }
2691
2692 /* We can't handle a * b + a * b. */
2693 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2694 return false;
2695
2696 /* While it is possible to validate whether or not the exact form
2697 that we've recognized is available in the backend, the assumption
2698 is that the transformation is never a loss. For instance, suppose
2699 the target only has the plain FMA pattern available. Consider
2700 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2701 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
2702 still have 3 operations, but in the FMA form the two NEGs are
2703 independent and could be run in parallel. */
2704 }
2705
2706 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2707 {
2708 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2709 enum tree_code use_code;
2710 tree addop, mulop1 = op1, result = mul_result;
2711 bool negate_p = false;
2712
2713 if (is_gimple_debug (use_stmt))
2714 continue;
2715
2716 use_code = gimple_assign_rhs_code (use_stmt);
2717 if (use_code == NEGATE_EXPR)
2718 {
2719 result = gimple_assign_lhs (use_stmt);
2720 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2721 gsi_remove (&gsi, true);
2722 release_defs (use_stmt);
2723
2724 use_stmt = neguse_stmt;
2725 gsi = gsi_for_stmt (use_stmt);
2726 use_code = gimple_assign_rhs_code (use_stmt);
2727 negate_p = true;
2728 }
2729
2730 if (gimple_assign_rhs1 (use_stmt) == result)
2731 {
2732 addop = gimple_assign_rhs2 (use_stmt);
2733 /* a * b - c -> a * b + (-c) */
2734 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2735 addop = force_gimple_operand_gsi (&gsi,
2736 build1 (NEGATE_EXPR,
2737 type, addop),
2738 true, NULL_TREE, true,
2739 GSI_SAME_STMT);
2740 }
2741 else
2742 {
2743 addop = gimple_assign_rhs1 (use_stmt);
2744 /* a - b * c -> (-b) * c + a */
2745 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2746 negate_p = !negate_p;
2747 }
2748
2749 if (negate_p)
2750 mulop1 = force_gimple_operand_gsi (&gsi,
2751 build1 (NEGATE_EXPR,
2752 type, mulop1),
2753 true, NULL_TREE, true,
2754 GSI_SAME_STMT);
2755
2756 fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
2757 gimple_assign_lhs (use_stmt),
2758 mulop1, op2,
2759 addop);
2760 gsi_replace (&gsi, fma_stmt, true);
2761 widen_mul_stats.fmas_inserted++;
2762 }
2763
2764 return true;
2765 }
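
/* Illustrative source for the transformation above (assumptions: FP
   contraction is enabled, i.e. not -ffp-contract=off, and the target has an
   fma pattern): the multiplication has a single use in the addition, so the
   use is rewritten to FMA_EXPR <a, b, c> and the multiply becomes dead.  */

static double
fma_example (double a, double b, double c)
{
  return a * b + c;
}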
2766
2767 /* Find integer multiplications where the operands are extended from
2768 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2769 where appropriate. */
2770
2771 static unsigned int
2772 execute_optimize_widening_mul (void)
2773 {
2774 basic_block bb;
2775 bool cfg_changed = false;
2776
2777 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2778
2779 FOR_EACH_BB (bb)
2780 {
2781 gimple_stmt_iterator gsi;
2782
2783 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2784 {
2785 gimple stmt = gsi_stmt (gsi);
2786 enum tree_code code;
2787
2788 if (is_gimple_assign (stmt))
2789 {
2790 code = gimple_assign_rhs_code (stmt);
2791 switch (code)
2792 {
2793 case MULT_EXPR:
2794 if (!convert_mult_to_widen (stmt, &gsi)
2795 && convert_mult_to_fma (stmt,
2796 gimple_assign_rhs1 (stmt),
2797 gimple_assign_rhs2 (stmt)))
2798 {
2799 gsi_remove (&gsi, true);
2800 release_defs (stmt);
2801 continue;
2802 }
2803 break;
2804
2805 case PLUS_EXPR:
2806 case MINUS_EXPR:
2807 convert_plusminus_to_widen (&gsi, stmt, code);
2808 break;
2809
2810 default:;
2811 }
2812 }
2813 else if (is_gimple_call (stmt)
2814 && gimple_call_lhs (stmt))
2815 {
2816 tree fndecl = gimple_call_fndecl (stmt);
2817 if (fndecl
2818 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2819 {
2820 switch (DECL_FUNCTION_CODE (fndecl))
2821 {
2822 case BUILT_IN_POWF:
2823 case BUILT_IN_POW:
2824 case BUILT_IN_POWL:
2825 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2826 && REAL_VALUES_EQUAL
2827 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2828 dconst2)
2829 && convert_mult_to_fma (stmt,
2830 gimple_call_arg (stmt, 0),
2831 gimple_call_arg (stmt, 0)))
2832 {
2833 unlink_stmt_vdef (stmt);
2834 if (gsi_remove (&gsi, true)
2835 && gimple_purge_dead_eh_edges (bb))
2836 cfg_changed = true;
2837 release_defs (stmt);
2838 continue;
2839 }
2840 break;
2841
2842 default:;
2843 }
2844 }
2845 }
2846 gsi_next (&gsi);
2847 }
2848 }
2849
2850 statistics_counter_event (cfun, "widening multiplications inserted",
2851 widen_mul_stats.widen_mults_inserted);
2852 statistics_counter_event (cfun, "widening maccs inserted",
2853 widen_mul_stats.maccs_inserted);
2854 statistics_counter_event (cfun, "fused multiply-adds inserted",
2855 widen_mul_stats.fmas_inserted);
2856
2857 return cfg_changed ? TODO_cleanup_cfg : 0;
2858 }
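
/* Hedged sketch of the builtin case handled in the loop above: with the
   same FMA prerequisites as before, pow (x, 2.0) is treated as x * x, so
   when its result feeds an addition the call can be replaced by
   FMA_EXPR <x, x, y>.  */

static double
pow_squared_fma_example (double x, double y)
{
  return __builtin_pow (x, 2.0) + y;
}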
2859
2860 static bool
2861 gate_optimize_widening_mul (void)
2862 {
2863 return flag_expensive_optimizations && optimize;
2864 }
2865
2866 namespace {
2867
2868 const pass_data pass_data_optimize_widening_mul =
2869 {
2870 GIMPLE_PASS, /* type */
2871 "widening_mul", /* name */
2872 OPTGROUP_NONE, /* optinfo_flags */
2873 true, /* has_gate */
2874 true, /* has_execute */
2875 TV_NONE, /* tv_id */
2876 PROP_ssa, /* properties_required */
2877 0, /* properties_provided */
2878 0, /* properties_destroyed */
2879 0, /* todo_flags_start */
2880 ( TODO_verify_ssa | TODO_verify_stmts
2881 | TODO_update_ssa ), /* todo_flags_finish */
2882 };
2883
2884 class pass_optimize_widening_mul : public gimple_opt_pass
2885 {
2886 public:
2887 pass_optimize_widening_mul (gcc::context *ctxt)
2888 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
2889 {}
2890
2891 /* opt_pass methods: */
2892 bool gate () { return gate_optimize_widening_mul (); }
2893 unsigned int execute () { return execute_optimize_widening_mul (); }
2894
2895 }; // class pass_optimize_widening_mul
2896
2897 } // anon namespace
2898
2899 gimple_opt_pass *
2900 make_pass_optimize_widening_mul (gcc::context *ctxt)
2901 {
2902 return new pass_optimize_widening_mul (ctxt);
2903 }