gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
43 First of all, experiments showed that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, we can instead insert
54 divisions elsewhere, in blocks that are common dominators of
55 those that have the division.)
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has fewer than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and their common dominators,
63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "hash-set.h"
93 #include "vec.h"
94 #include "input.h"
95 #include "alias.h"
96 #include "symtab.h"
97 #include "inchash.h"
98 #include "tree.h"
99 #include "fold-const.h"
100 #include "predict.h"
101 #include "hard-reg-set.h"
102 #include "function.h"
103 #include "dominance.h"
104 #include "cfg.h"
105 #include "basic-block.h"
106 #include "tree-ssa-alias.h"
107 #include "internal-fn.h"
108 #include "gimple-fold.h"
109 #include "gimple-expr.h"
110 #include "is-a.h"
111 #include "gimple.h"
112 #include "gimple-iterator.h"
113 #include "gimplify.h"
114 #include "gimplify-me.h"
115 #include "stor-layout.h"
116 #include "gimple-ssa.h"
117 #include "tree-cfg.h"
118 #include "tree-phinodes.h"
119 #include "ssa-iterators.h"
120 #include "stringpool.h"
121 #include "tree-ssanames.h"
122 #include "hashtab.h"
123 #include "rtl.h"
124 #include "statistics.h"
125 #include "insn-config.h"
126 #include "expmed.h"
127 #include "dojump.h"
128 #include "explow.h"
129 #include "calls.h"
130 #include "emit-rtl.h"
131 #include "varasm.h"
132 #include "stmt.h"
133 #include "expr.h"
134 #include "tree-dfa.h"
135 #include "tree-ssa.h"
136 #include "tree-pass.h"
137 #include "alloc-pool.h"
138 #include "target.h"
139 #include "gimple-pretty-print.h"
140 #include "builtins.h"
141 #include "params.h"
142
143 /* FIXME: RTL headers have to be included here for optabs. */
144 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
145 #include "expr.h" /* Because optabs.h wants sepops. */
146 #include "insn-codes.h"
147 #include "optabs.h"
148
149 /* This structure represents one basic block that either computes a
150 division, or is a common dominator for basic blocks that compute a
151 division. */
152 struct occurrence {
153 /* The basic block represented by this structure. */
154 basic_block bb;
155
156 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
157 inserted in BB. */
158 tree recip_def;
159
160 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
161 was inserted in BB. */
162 gimple recip_def_stmt;
163
164 /* Pointer to a list of "struct occurrence"s for blocks dominated
165 by BB. */
166 struct occurrence *children;
167
168 Pointer to the next "struct occurrence" in the list of blocks
169 sharing a common dominator. */
170 struct occurrence *next;
171
172 /* The number of divisions that are in BB before compute_merit. The
173 number of divisions that are in BB or post-dominate it after
174 compute_merit. */
175 int num_divisions;
176
177 /* True if the basic block has a division, false if it is a common
178 dominator for basic blocks that do. If it is false and trapping
179 math is active, BB is not a candidate for inserting a reciprocal. */
180 bool bb_has_division;
181 };
182
183 static struct
184 {
185 /* Number of 1.0/X ops inserted. */
186 int rdivs_inserted;
187
188 /* Number of 1.0/FUNC ops inserted. */
189 int rfuncs_inserted;
190 } reciprocal_stats;
191
192 static struct
193 {
194 /* Number of cexpi calls inserted. */
195 int inserted;
196 } sincos_stats;
197
198 static struct
199 {
200 /* Number of hand-written 16-bit nop / bswaps found. */
201 int found_16bit;
202
203 /* Number of hand-written 32-bit nop / bswaps found. */
204 int found_32bit;
205
206 /* Number of hand-written 64-bit nop / bswaps found. */
207 int found_64bit;
208 } nop_stats, bswap_stats;
209
210 static struct
211 {
212 /* Number of widening multiplication ops inserted. */
213 int widen_mults_inserted;
214
215 /* Number of integer multiply-and-accumulate ops inserted. */
216 int maccs_inserted;
217
218 /* Number of fp fused multiply-add ops inserted. */
219 int fmas_inserted;
220 } widen_mul_stats;
221
222 /* The instance of "struct occurrence" representing the highest
223 interesting block in the dominator tree. */
224 static struct occurrence *occ_head;
225
226 /* Allocation pool for getting instances of "struct occurrence". */
227 static pool_allocator<occurrence> *occ_pool;
228
229
230
231 /* Allocate and return a new struct occurrence for basic block BB, and
232 whose children list is headed by CHILDREN. */
233 static struct occurrence *
234 occ_new (basic_block bb, struct occurrence *children)
235 {
236 struct occurrence *occ;
237
238 bb->aux = occ = occ_pool->allocate ();
239 memset (occ, 0, sizeof (struct occurrence));
240
241 occ->bb = bb;
242 occ->children = children;
243 return occ;
244 }
245
246
247 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
248 list of "struct occurrence"s, one per basic block, having IDOM as
249 their common dominator.
250
251 We try to insert NEW_OCC as deep as possible in the tree, and we also
252 insert any other block that is a common dominator for BB and one
253 block already in the tree. */
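/* Illustrative sketch (hypothetical blocks, not taken from the code below):
   suppose *P_HEAD currently holds an occurrence for block B2, and NEW_OCC is
   created for block B3, where neither of B2 and B3 dominates the other but
   both are dominated by a block B1 strictly below IDOM.  The loop below then
   computes DOM == B1, creates an occurrence for B1 with B3 and B2 as its
   children, keeps scanning the list for further blocks dominated by B1, and
   finally adds B1's occurrence to the list at *P_HEAD.  */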
254
255 static void
256 insert_bb (struct occurrence *new_occ, basic_block idom,
257 struct occurrence **p_head)
258 {
259 struct occurrence *occ, **p_occ;
260
261 for (p_occ = p_head; (occ = *p_occ) != NULL; )
262 {
263 basic_block bb = new_occ->bb, occ_bb = occ->bb;
264 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
265 if (dom == bb)
266 {
267 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
268 from its list. */
269 *p_occ = occ->next;
270 occ->next = new_occ->children;
271 new_occ->children = occ;
272
273 /* Try the next block (it may also be dominated by BB). */
274 }
275
276 else if (dom == occ_bb)
277 {
278 /* OCC_BB dominates BB. Tail recurse to look deeper. */
279 insert_bb (new_occ, dom, &occ->children);
280 return;
281 }
282
283 else if (dom != idom)
284 {
285 gcc_assert (!dom->aux);
286
287 /* There is a dominator between IDOM and BB, add it and make
288 two children out of NEW_OCC and OCC. First, remove OCC from
289 its list. */
290 *p_occ = occ->next;
291 new_occ->next = occ;
292 occ->next = NULL;
293
294 /* None of the previous blocks has DOM as a dominator: if we tail
295 recursed, we would reexamine them uselessly. Just switch BB with
296 DOM, and go on looking for blocks dominated by DOM. */
297 new_occ = occ_new (dom, new_occ);
298 }
299
300 else
301 {
302 /* Nothing special, go on with the next element. */
303 p_occ = &occ->next;
304 }
305 }
306
307 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
308 new_occ->next = *p_head;
309 *p_head = new_occ;
310 }
311
312 /* Register that we found a division in BB. */
313
314 static inline void
315 register_division_in (basic_block bb)
316 {
317 struct occurrence *occ;
318
319 occ = (struct occurrence *) bb->aux;
320 if (!occ)
321 {
322 occ = occ_new (bb, NULL);
323 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
324 }
325
326 occ->bb_has_division = true;
327 occ->num_divisions++;
328 }
329
330
331 /* Compute the number of divisions that postdominate each block in OCC and
332 its children. */
333
334 static void
335 compute_merit (struct occurrence *occ)
336 {
337 struct occurrence *occ_child;
338 basic_block dom = occ->bb;
339
340 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
341 {
342 basic_block bb;
343 if (occ_child->children)
344 compute_merit (occ_child);
345
346 if (flag_exceptions)
347 bb = single_noncomplex_succ (dom);
348 else
349 bb = dom;
350
351 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
352 occ->num_divisions += occ_child->num_divisions;
353 }
354 }
355
356
357 /* Return whether USE_STMT is a floating-point division by DEF. */
358 static inline bool
359 is_division_by (gimple use_stmt, tree def)
360 {
361 return is_gimple_assign (use_stmt)
362 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
363 && gimple_assign_rhs2 (use_stmt) == def
364 /* Do not recognize x / x as valid division, as we are getting
365 confused later by replacing all immediate uses of x in such
366 a stmt. */
367 && gimple_assign_rhs1 (use_stmt) != def;
368 }
369
370 /* Walk the subset of the dominator tree rooted at OCC, setting the
371 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
372 the given basic block. The field may be left NULL, of course,
373 if it is not possible or profitable to do the optimization.
374
375 DEF_GSI is an iterator pointing at the statement defining DEF.
376 If RECIP_DEF is set, a dominator already has a computation that can
377 be used. */
378
379 static void
380 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
381 tree def, tree recip_def, int threshold)
382 {
383 tree type;
384 gassign *new_stmt;
385 gimple_stmt_iterator gsi;
386 struct occurrence *occ_child;
387
388 if (!recip_def
389 && (occ->bb_has_division || !flag_trapping_math)
390 && occ->num_divisions >= threshold)
391 {
392 /* Make a variable with the replacement and substitute it. */
393 type = TREE_TYPE (def);
394 recip_def = create_tmp_reg (type, "reciptmp");
395 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
396 build_one_cst (type), def);
397
398 if (occ->bb_has_division)
399 {
400 /* Case 1: insert before an existing division. */
401 gsi = gsi_after_labels (occ->bb);
402 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
403 gsi_next (&gsi);
404
405 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
406 }
407 else if (def_gsi && occ->bb == def_gsi->bb)
408 {
409 /* Case 2: insert right after the definition. Note that this will
410 never happen if the definition statement can throw, because in
411 that case the sole successor of the statement's basic block will
412 dominate all the uses as well. */
413 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
414 }
415 else
416 {
417 /* Case 3: insert in a basic block not containing defs/uses. */
418 gsi = gsi_after_labels (occ->bb);
419 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
420 }
421
422 reciprocal_stats.rdivs_inserted++;
423
424 occ->recip_def_stmt = new_stmt;
425 }
426
427 occ->recip_def = recip_def;
428 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
429 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
430 }
431
432
433 /* Replace the division at USE_P with a multiplication by the reciprocal, if
434 possible. */
435
436 static inline void
437 replace_reciprocal (use_operand_p use_p)
438 {
439 gimple use_stmt = USE_STMT (use_p);
440 basic_block bb = gimple_bb (use_stmt);
441 struct occurrence *occ = (struct occurrence *) bb->aux;
442
443 if (optimize_bb_for_speed_p (bb)
444 && occ->recip_def && use_stmt != occ->recip_def_stmt)
445 {
446 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
447 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
448 SET_USE (use_p, occ->recip_def);
449 fold_stmt_inplace (&gsi);
450 update_stmt (use_stmt);
451 }
452 }
453
454
455 /* Free OCC and return one more "struct occurrence" to be freed. */
456
457 static struct occurrence *
458 free_bb (struct occurrence *occ)
459 {
460 struct occurrence *child, *next;
461
462 /* First get the two pointers hanging off OCC. */
463 next = occ->next;
464 child = occ->children;
465 occ->bb->aux = NULL;
466 occ_pool->remove (occ);
467
468 /* Now ensure that we don't recurse unless it is necessary. */
469 if (!child)
470 return next;
471 else
472 {
473 while (next)
474 next = free_bb (next);
475
476 return child;
477 }
478 }
479
480
481 /* Look for floating-point divisions among DEF's uses, and try to
482 replace them by multiplications with the reciprocal. Add
483 as many statements computing the reciprocal as needed.
484
485 DEF must be a GIMPLE register of a floating-point type. */
486
487 static void
488 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
489 {
490 use_operand_p use_p;
491 imm_use_iterator use_iter;
492 struct occurrence *occ;
493 int count = 0, threshold;
494
495 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
496
497 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
498 {
499 gimple use_stmt = USE_STMT (use_p);
500 if (is_division_by (use_stmt, def))
501 {
502 register_division_in (gimple_bb (use_stmt));
503 count++;
504 }
505 }
506
507 /* Do the expensive part only if we can hope to optimize something. */
508 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
509 if (count >= threshold)
510 {
511 gimple use_stmt;
512 for (occ = occ_head; occ; occ = occ->next)
513 {
514 compute_merit (occ);
515 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
516 }
517
518 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
519 {
520 if (is_division_by (use_stmt, def))
521 {
522 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
523 replace_reciprocal (use_p);
524 }
525 }
526 }
527
528 for (occ = occ_head; occ; )
529 occ = free_bb (occ);
530
531 occ_head = NULL;
532 }
533
534 /* Go through all the floating-point SSA_NAMEs, and call
535 execute_cse_reciprocals_1 on each of them. */
536 namespace {
537
538 const pass_data pass_data_cse_reciprocals =
539 {
540 GIMPLE_PASS, /* type */
541 "recip", /* name */
542 OPTGROUP_NONE, /* optinfo_flags */
543 TV_NONE, /* tv_id */
544 PROP_ssa, /* properties_required */
545 0, /* properties_provided */
546 0, /* properties_destroyed */
547 0, /* todo_flags_start */
548 TODO_update_ssa, /* todo_flags_finish */
549 };
550
551 class pass_cse_reciprocals : public gimple_opt_pass
552 {
553 public:
554 pass_cse_reciprocals (gcc::context *ctxt)
555 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
556 {}
557
558 /* opt_pass methods: */
559 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
560 virtual unsigned int execute (function *);
561
562 }; // class pass_cse_reciprocals
563
564 unsigned int
565 pass_cse_reciprocals::execute (function *fun)
566 {
567 basic_block bb;
568 tree arg;
569
570 occ_pool = new pool_allocator<occurrence>
571 ("dominators for recip", n_basic_blocks_for_fn (fun) / 3 + 1);
572
573 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
574 calculate_dominance_info (CDI_DOMINATORS);
575 calculate_dominance_info (CDI_POST_DOMINATORS);
576
577 #ifdef ENABLE_CHECKING
578 FOR_EACH_BB_FN (bb, fun)
579 gcc_assert (!bb->aux);
580 #endif
581
582 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
583 if (FLOAT_TYPE_P (TREE_TYPE (arg))
584 && is_gimple_reg (arg))
585 {
586 tree name = ssa_default_def (fun, arg);
587 if (name)
588 execute_cse_reciprocals_1 (NULL, name);
589 }
590
591 FOR_EACH_BB_FN (bb, fun)
592 {
593 tree def;
594
595 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
596 gsi_next (&gsi))
597 {
598 gphi *phi = gsi.phi ();
599 def = PHI_RESULT (phi);
600 if (! virtual_operand_p (def)
601 && FLOAT_TYPE_P (TREE_TYPE (def)))
602 execute_cse_reciprocals_1 (NULL, def);
603 }
604
605 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
606 gsi_next (&gsi))
607 {
608 gimple stmt = gsi_stmt (gsi);
609
610 if (gimple_has_lhs (stmt)
611 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
612 && FLOAT_TYPE_P (TREE_TYPE (def))
613 && TREE_CODE (def) == SSA_NAME)
614 execute_cse_reciprocals_1 (&gsi, def);
615 }
616
617 if (optimize_bb_for_size_p (bb))
618 continue;
619
620 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
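      /* Sketch of the rewrite (hypothetical SSA names; RFUNC stands for
         whatever reciprocal builtin targetm.builtin_reciprocal returns):

           t_1 = __builtin_sqrt (b_2);          t_1 = RFUNC (b_2);
           x_3 = a_4 / t_1;              ==>    x_3 = a_4 * t_1;

         i.e. the call defining the divisor is rewritten in place and every
         division by its result becomes a multiplication.  */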
621 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
622 gsi_next (&gsi))
623 {
624 gimple stmt = gsi_stmt (gsi);
625 tree fndecl;
626
627 if (is_gimple_assign (stmt)
628 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
629 {
630 tree arg1 = gimple_assign_rhs2 (stmt);
631 gimple stmt1;
632
633 if (TREE_CODE (arg1) != SSA_NAME)
634 continue;
635
636 stmt1 = SSA_NAME_DEF_STMT (arg1);
637
638 if (is_gimple_call (stmt1)
639 && gimple_call_lhs (stmt1)
640 && (fndecl = gimple_call_fndecl (stmt1))
641 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
642 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
643 {
644 enum built_in_function code;
645 bool md_code, fail;
646 imm_use_iterator ui;
647 use_operand_p use_p;
648
649 code = DECL_FUNCTION_CODE (fndecl);
650 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
651
652 fndecl = targetm.builtin_reciprocal (code, md_code, false);
653 if (!fndecl)
654 continue;
655
656 /* Check that all uses of the SSA name are divisions,
657 otherwise replacing the defining statement will do
658 the wrong thing. */
659 fail = false;
660 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
661 {
662 gimple stmt2 = USE_STMT (use_p);
663 if (is_gimple_debug (stmt2))
664 continue;
665 if (!is_gimple_assign (stmt2)
666 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
667 || gimple_assign_rhs1 (stmt2) == arg1
668 || gimple_assign_rhs2 (stmt2) != arg1)
669 {
670 fail = true;
671 break;
672 }
673 }
674 if (fail)
675 continue;
676
677 gimple_replace_ssa_lhs (stmt1, arg1);
678 gimple_call_set_fndecl (stmt1, fndecl);
679 update_stmt (stmt1);
680 reciprocal_stats.rfuncs_inserted++;
681
682 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
683 {
684 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
685 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
686 fold_stmt_inplace (&gsi);
687 update_stmt (stmt);
688 }
689 }
690 }
691 }
692 }
693
694 statistics_counter_event (fun, "reciprocal divs inserted",
695 reciprocal_stats.rdivs_inserted);
696 statistics_counter_event (fun, "reciprocal functions inserted",
697 reciprocal_stats.rfuncs_inserted);
698
699 free_dominance_info (CDI_DOMINATORS);
700 free_dominance_info (CDI_POST_DOMINATORS);
701 delete occ_pool;
702 return 0;
703 }
704
705 } // anon namespace
706
707 gimple_opt_pass *
708 make_pass_cse_reciprocals (gcc::context *ctxt)
709 {
710 return new pass_cse_reciprocals (ctxt);
711 }
712
713 /* Records an occurrence at statement USE_STMT in the vector STMTS if
714 USE_STMT's basic block is dominated by *TOP_BB, dominates *TOP_BB, or
715 if *TOP_BB is not yet initialized. Returns true if the occurrence was pushed on
716 the vector. Adjusts *TOP_BB to be the basic block dominating all
717 statements in the vector. */
718
719 static bool
720 maybe_record_sincos (vec<gimple> *stmts,
721 basic_block *top_bb, gimple use_stmt)
722 {
723 basic_block use_bb = gimple_bb (use_stmt);
724 if (*top_bb
725 && (*top_bb == use_bb
726 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
727 stmts->safe_push (use_stmt);
728 else if (!*top_bb
729 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
730 {
731 stmts->safe_push (use_stmt);
732 *top_bb = use_bb;
733 }
734 else
735 return false;
736
737 return true;
738 }
739
740 /* Look for sin, cos and cexpi calls with the same argument NAME and
741 create a single call to cexpi CSEing the result in this case.
742 We first walk over all immediate uses of the argument collecting
743 statements that we can CSE in a vector and in a second pass replace
744 the statement rhs with a REALPART or IMAGPART expression on the
745 result of the cexpi call we insert before the use statement that
746 dominates all other candidates. */
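/* Illustrative sketch of the rewrite (GIMPLE shown informally, SSA names
   hypothetical):

     c_1 = __builtin_cos (x_2);
     s_3 = __builtin_sin (x_2);

   becomes

     sincostmp_4 = __builtin_cexpi (x_2);
     c_1 = REALPART_EXPR <sincostmp_4>;
     s_3 = IMAGPART_EXPR <sincostmp_4>;  */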
747
748 static bool
749 execute_cse_sincos_1 (tree name)
750 {
751 gimple_stmt_iterator gsi;
752 imm_use_iterator use_iter;
753 tree fndecl, res, type;
754 gimple def_stmt, use_stmt, stmt;
755 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
756 auto_vec<gimple> stmts;
757 basic_block top_bb = NULL;
758 int i;
759 bool cfg_changed = false;
760
761 type = TREE_TYPE (name);
762 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
763 {
764 if (gimple_code (use_stmt) != GIMPLE_CALL
765 || !gimple_call_lhs (use_stmt)
766 || !(fndecl = gimple_call_fndecl (use_stmt))
767 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
768 continue;
769
770 switch (DECL_FUNCTION_CODE (fndecl))
771 {
772 CASE_FLT_FN (BUILT_IN_COS):
773 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
774 break;
775
776 CASE_FLT_FN (BUILT_IN_SIN):
777 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
778 break;
779
780 CASE_FLT_FN (BUILT_IN_CEXPI):
781 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
782 break;
783
784 default:;
785 }
786 }
787
788 if (seen_cos + seen_sin + seen_cexpi <= 1)
789 return false;
790
791 /* Simply insert cexpi at the beginning of top_bb but not earlier than
792 the name def statement. */
793 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
794 if (!fndecl)
795 return false;
796 stmt = gimple_build_call (fndecl, 1, name);
797 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
798 gimple_call_set_lhs (stmt, res);
799
800 def_stmt = SSA_NAME_DEF_STMT (name);
801 if (!SSA_NAME_IS_DEFAULT_DEF (name)
802 && gimple_code (def_stmt) != GIMPLE_PHI
803 && gimple_bb (def_stmt) == top_bb)
804 {
805 gsi = gsi_for_stmt (def_stmt);
806 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
807 }
808 else
809 {
810 gsi = gsi_after_labels (top_bb);
811 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
812 }
813 sincos_stats.inserted++;
814
815 /* And adjust the recorded old call sites. */
816 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
817 {
818 tree rhs = NULL;
819 fndecl = gimple_call_fndecl (use_stmt);
820
821 switch (DECL_FUNCTION_CODE (fndecl))
822 {
823 CASE_FLT_FN (BUILT_IN_COS):
824 rhs = fold_build1 (REALPART_EXPR, type, res);
825 break;
826
827 CASE_FLT_FN (BUILT_IN_SIN):
828 rhs = fold_build1 (IMAGPART_EXPR, type, res);
829 break;
830
831 CASE_FLT_FN (BUILT_IN_CEXPI):
832 rhs = res;
833 break;
834
835 default:;
836 gcc_unreachable ();
837 }
838
839 /* Replace call with a copy. */
840 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
841
842 gsi = gsi_for_stmt (use_stmt);
843 gsi_replace (&gsi, stmt, true);
844 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
845 cfg_changed = true;
846 }
847
848 return cfg_changed;
849 }
850
851 /* To evaluate powi(x,n), the floating point value x raised to the
852 constant integer exponent n, we use a hybrid algorithm that
853 combines the "window method" with look-up tables. For an
854 introduction to exponentiation algorithms and "addition chains",
855 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
856 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
857 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
858 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
859
860 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
861 multiplications to inline before calling the system library's pow
862 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
863 so this default never requires calling pow, powf or powl. */
864
865 #ifndef POWI_MAX_MULTS
866 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
867 #endif
868
869 /* The size of the "optimal power tree" lookup table. All
870 exponents less than this value are simply looked up in the
871 powi_table below. This threshold is also used to size the
872 cache of pseudo registers that hold intermediate results. */
873 #define POWI_TABLE_SIZE 256
874
875 /* The size, in bits, of the window used in the "window method"
876 exponentiation algorithm. This is equivalent to a radix of
877 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
878 #define POWI_WINDOW_SIZE 3
879
880 /* The following table is an efficient representation of an
881 "optimal power tree". For each value, i, the corresponding
882 value, j, in the table states that an optimal evaluation
883 sequence for calculating pow(x,i) can be found by evaluating
884 pow(x,j)*pow(x,i-j). An optimal power tree for the first
885 100 integers is given in Knuth's "Seminumerical algorithms". */
886
887 static const unsigned char powi_table[POWI_TABLE_SIZE] =
888 {
889 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
890 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
891 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
892 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
893 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
894 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
895 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
896 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
897 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
898 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
899 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
900 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
901 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
902 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
903 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
904 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
905 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
906 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
907 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
908 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
909 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
910 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
911 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
912 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
913 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
914 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
915 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
916 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
917 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
918 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
919 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
920 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
921 };
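/* Worked example (illustration only): for a small exponent such as 7,
   powi_table[7] == 4, so x**7 is evaluated as x**4 * x**3; in turn
   powi_table[4] == 2 and powi_table[3] == 2 give x**4 = x**2 * x**2 and
   x**3 = x**2 * x, four multiplications in all.  For an exponent of at
   least POWI_TABLE_SIZE, say 259, the window method peels off the low
   POWI_WINDOW_SIZE bits when the exponent is odd (x**259 = x**256 * x**3)
   and halves even exponents by squaring (x**256 = (x**128)**2) until the
   table can take over.  */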
922
923
924 /* Return the number of multiplications required to calculate
925 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
926 subroutine of powi_cost. CACHE is an array indicating
927 which exponents have already been calculated. */
928
929 static int
930 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
931 {
932 /* If we've already calculated this exponent, then this evaluation
933 doesn't require any additional multiplications. */
934 if (cache[n])
935 return 0;
936
937 cache[n] = true;
938 return powi_lookup_cost (n - powi_table[n], cache)
939 + powi_lookup_cost (powi_table[n], cache) + 1;
940 }
941
942 /* Return the number of multiplications required to calculate
943 powi(x,n) for an arbitrary x, given the exponent N. This
944 function needs to be kept in sync with powi_as_mults below. */
945
946 static int
947 powi_cost (HOST_WIDE_INT n)
948 {
949 bool cache[POWI_TABLE_SIZE];
950 unsigned HOST_WIDE_INT digit;
951 unsigned HOST_WIDE_INT val;
952 int result;
953
954 if (n == 0)
955 return 0;
956
957 /* Ignore the reciprocal when calculating the cost. */
958 val = (n < 0) ? -n : n;
959
960 /* Initialize the exponent cache. */
961 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
962 cache[1] = true;
963
964 result = 0;
965
966 while (val >= POWI_TABLE_SIZE)
967 {
968 if (val & 1)
969 {
970 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
971 result += powi_lookup_cost (digit, cache)
972 + POWI_WINDOW_SIZE + 1;
973 val >>= POWI_WINDOW_SIZE;
974 }
975 else
976 {
977 val >>= 1;
978 result++;
979 }
980 }
981
982 return result + powi_lookup_cost (val, cache);
983 }
984
985 /* Recursive subroutine of powi_as_mults. This function takes the
986 array, CACHE, of already calculated exponents and an exponent N and
987 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
988
989 static tree
990 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
991 HOST_WIDE_INT n, tree *cache)
992 {
993 tree op0, op1, ssa_target;
994 unsigned HOST_WIDE_INT digit;
995 gassign *mult_stmt;
996
997 if (n < POWI_TABLE_SIZE && cache[n])
998 return cache[n];
999
1000 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1001
1002 if (n < POWI_TABLE_SIZE)
1003 {
1004 cache[n] = ssa_target;
1005 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1006 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1007 }
1008 else if (n & 1)
1009 {
1010 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1011 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1012 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1013 }
1014 else
1015 {
1016 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1017 op1 = op0;
1018 }
1019
1020 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1021 gimple_set_location (mult_stmt, loc);
1022 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1023
1024 return ssa_target;
1025 }
1026
1027 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1028 This function needs to be kept in sync with powi_cost above. */
1029
1030 static tree
1031 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1032 tree arg0, HOST_WIDE_INT n)
1033 {
1034 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1035 gassign *div_stmt;
1036 tree target;
1037
1038 if (n == 0)
1039 return build_real (type, dconst1);
1040
1041 memset (cache, 0, sizeof (cache));
1042 cache[1] = arg0;
1043
1044 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1045 if (n >= 0)
1046 return result;
1047
1048 /* If the original exponent was negative, reciprocate the result. */
1049 target = make_temp_ssa_name (type, NULL, "powmult");
1050 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1051 build_real (type, dconst1), result);
1052 gimple_set_location (div_stmt, loc);
1053 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1054
1055 return target;
1056 }
1057
1058 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1059 location info LOC. If the arguments are appropriate, create an
1060 equivalent sequence of statements prior to GSI using an optimal
1061 number of multiplications, and return an expression holding the
1062 result. */
1063
1064 static tree
1065 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1066 tree arg0, HOST_WIDE_INT n)
1067 {
1068 /* Avoid largest negative number. */
1069 if (n != -n
1070 && ((n >= -1 && n <= 2)
1071 || (optimize_function_for_speed_p (cfun)
1072 && powi_cost (n) <= POWI_MAX_MULTS)))
1073 return powi_as_mults (gsi, loc, arg0, n);
1074
1075 return NULL_TREE;
1076 }
1077
1078 /* Build a gimple call statement that calls FN with argument ARG.
1079 Set the lhs of the call statement to a fresh SSA name. Insert the
1080 statement prior to GSI's current position, and return the fresh
1081 SSA name. */
1082
1083 static tree
1084 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1085 tree fn, tree arg)
1086 {
1087 gcall *call_stmt;
1088 tree ssa_target;
1089
1090 call_stmt = gimple_build_call (fn, 1, arg);
1091 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1092 gimple_set_lhs (call_stmt, ssa_target);
1093 gimple_set_location (call_stmt, loc);
1094 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1095
1096 return ssa_target;
1097 }
1098
1099 /* Build a gimple binary operation with the given CODE and arguments
1100 ARG0, ARG1, assigning the result to a new SSA name built from NAME.
1101 Insert the statement prior to GSI's current position, and
1102 return the fresh SSA name. */
1103
1104 static tree
1105 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1106 const char *name, enum tree_code code,
1107 tree arg0, tree arg1)
1108 {
1109 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1110 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1111 gimple_set_location (stmt, loc);
1112 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1113 return result;
1114 }
1115
1116 /* Build a gimple reference operation with the given CODE and argument
1117 ARG, assigning the result to a new SSA name of TYPE with NAME.
1118 Insert the statement prior to GSI's current position, and return
1119 the fresh SSA name. */
1120
1121 static inline tree
1122 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1123 const char *name, enum tree_code code, tree arg0)
1124 {
1125 tree result = make_temp_ssa_name (type, NULL, name);
1126 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1127 gimple_set_location (stmt, loc);
1128 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1129 return result;
1130 }
1131
1132 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1133 prior to GSI's current position, and return the fresh SSA name. */
1134
1135 static tree
1136 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1137 tree type, tree val)
1138 {
1139 tree result = make_ssa_name (type);
1140 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1141 gimple_set_location (stmt, loc);
1142 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1143 return result;
1144 }
1145
1146 struct pow_synth_sqrt_info
1147 {
1148 bool *factors;
1149 unsigned int deepest;
1150 unsigned int num_mults;
1151 };
1152
1153 /* Return true iff the real value C can be represented as a
1154 sum of powers of 0.5 up to N. That is:
1155 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1156 Record in INFO the various parameters of the synthesis algorithm such
1157 as the factors a[i], the maximum 0.5 power and the number of
1158 multiplications that will be required. */
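/* For example (illustration only): C = 0.625 = 0.5**1 + 0.5**3, so with
   N >= 3 this returns true with factors = {1, 0, 1}, deepest = 3 and
   num_mults = 1, corresponding to sqrt (x) * sqrt (sqrt (sqrt (x))).  */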
1159
1160 bool
1161 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1162 struct pow_synth_sqrt_info *info)
1163 {
1164 REAL_VALUE_TYPE factor = dconsthalf;
1165 REAL_VALUE_TYPE remainder = c;
1166
1167 info->deepest = 0;
1168 info->num_mults = 0;
1169 memset (info->factors, 0, n * sizeof (bool));
1170
1171 for (unsigned i = 0; i < n; i++)
1172 {
1173 REAL_VALUE_TYPE res;
1174
1175 /* If something inexact happened bail out now. */
1176 if (REAL_ARITHMETIC (res, MINUS_EXPR, remainder, factor))
1177 return false;
1178
1179 /* We have hit zero. The number is representable as a sum
1180 of powers of 0.5. */
1181 if (REAL_VALUES_EQUAL (res, dconst0))
1182 {
1183 info->factors[i] = true;
1184 info->deepest = i + 1;
1185 return true;
1186 }
1187 else if (!REAL_VALUE_NEGATIVE (res))
1188 {
1189 remainder = res;
1190 info->factors[i] = true;
1191 info->num_mults++;
1192 }
1193 else
1194 info->factors[i] = false;
1195
1196 REAL_ARITHMETIC (factor, MULT_EXPR, factor, dconsthalf);
1197 }
1198 return false;
1199 }
1200
1201 /* Return the tree corresponding to FN being applied
1202 to ARG N times at GSI and LOC.
1203 Look up previous results from CACHE if need be.
1204 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1205
1206 static tree
1207 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1208 tree fn, location_t loc, tree *cache)
1209 {
1210 tree res = cache[n];
1211 if (!res)
1212 {
1213 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1214 res = build_and_insert_call (gsi, loc, fn, prev);
1215 cache[n] = res;
1216 }
1217
1218 return res;
1219 }
1220
1221 /* Print to STREAM the repeated application of function FNAME to ARG
1222 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1223 "foo (foo (x))". */
1224
1225 static void
1226 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1227 unsigned int n)
1228 {
1229 if (n == 0)
1230 fprintf (stream, "%s", arg);
1231 else
1232 {
1233 fprintf (stream, "%s (", fname);
1234 print_nested_fn (stream, fname, arg, n - 1);
1235 fprintf (stream, ")");
1236 }
1237 }
1238
1239 /* Print to STREAM the fractional sequence of sqrt chains
1240 applied to ARG, described by INFO. Used for the dump file. */
1241
1242 static void
1243 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1244 struct pow_synth_sqrt_info *info)
1245 {
1246 for (unsigned int i = 0; i < info->deepest; i++)
1247 {
1248 bool is_set = info->factors[i];
1249 if (is_set)
1250 {
1251 print_nested_fn (stream, "sqrt", arg, i + 1);
1252 if (i != info->deepest - 1)
1253 fprintf (stream, " * ");
1254 }
1255 }
1256 }
1257
1258 /* Print to STREAM a representation of raising ARG to an integer
1259 power N. Used for the dump file. */
1260
1261 static void
1262 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1263 {
1264 if (n > 1)
1265 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1266 else if (n == 1)
1267 fprintf (stream, "%s", arg);
1268 }
1269
1270 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1271 square roots. Place at GSI and LOC. Limit the maximum depth
1272 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1273 result of the expanded sequence or NULL_TREE if the expansion failed.
1274
1275 This routine assumes that ARG1 is a real number with a fractional part
1276 (the integer exponent case will have been handled earlier in
1277 gimple_expand_builtin_pow).
1278
1279 For ARG1 > 0.0:
1280 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1281 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1282 FRAC_PART == ARG1 - WHOLE_PART:
1283 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1284 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1285 if it can be expressed as such, that is if FRAC_PART satisfies:
1286 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1287 where integer a[i] is either 0 or 1.
1288
1289 Example:
1290 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1291 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1292
1293 For ARG1 < 0.0 there are two approaches:
1294 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1295 is calculated as above.
1296
1297 Example:
1298 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1299 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1300
1301 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1302 FRAC_PART := ARG1 - WHOLE_PART
1303 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1304 Example:
1305 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1306 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1307
1308 For ARG1 < 0.0 we choose between (A) and (B) depending on
1309 how many multiplications we'd have to do.
1310 So, for the example in (B): POW (x, -5.875), if we were to
1311 follow algorithm (A) we would produce:
1312 1.0 / (POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X))))
1313 which contains more multiplications than approach (B).
1314
1315 Hopefully, this approach will eliminate potentially expensive POW library
1316 calls when unsafe floating point math is enabled and allow the compiler to
1317 further optimise the multiplies, square roots and divides produced by this
1318 function. */
1319
1320 static tree
1321 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1322 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1323 {
1324 tree type = TREE_TYPE (arg0);
1325 machine_mode mode = TYPE_MODE (type);
1326 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1327 bool one_over = true;
1328
1329 if (!sqrtfn)
1330 return NULL_TREE;
1331
1332 if (TREE_CODE (arg1) != REAL_CST)
1333 return NULL_TREE;
1334
1335 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1336
1337 gcc_assert (max_depth > 0);
1338 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1339
1340 struct pow_synth_sqrt_info synth_info;
1341 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1342 synth_info.deepest = 0;
1343 synth_info.num_mults = 0;
1344
1345 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1346 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1347
1348 /* The whole and fractional parts of exp. */
1349 REAL_VALUE_TYPE whole_part;
1350 REAL_VALUE_TYPE frac_part;
1351
1352 real_floor (&whole_part, mode, &exp);
1353 REAL_ARITHMETIC (frac_part, MINUS_EXPR, exp, whole_part);
1354
1355
1356 REAL_VALUE_TYPE ceil_whole = dconst0;
1357 REAL_VALUE_TYPE ceil_fract = dconst0;
1358
1359 if (neg_exp)
1360 {
1361 real_ceil (&ceil_whole, mode, &exp);
1362 REAL_ARITHMETIC (ceil_fract, MINUS_EXPR, ceil_whole, exp);
1363 }
1364
1365 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1366 return NULL_TREE;
1367
1368 /* Check whether it's more profitable to not use 1.0 / ... */
1369 if (neg_exp)
1370 {
1371 struct pow_synth_sqrt_info alt_synth_info;
1372 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1373 alt_synth_info.deepest = 0;
1374 alt_synth_info.num_mults = 0;
1375
1376 if (representable_as_half_series_p (ceil_fract, max_depth,
1377 &alt_synth_info)
1378 && alt_synth_info.deepest <= synth_info.deepest
1379 && alt_synth_info.num_mults < synth_info.num_mults)
1380 {
1381 whole_part = ceil_whole;
1382 frac_part = ceil_fract;
1383 synth_info.deepest = alt_synth_info.deepest;
1384 synth_info.num_mults = alt_synth_info.num_mults;
1385 memcpy (synth_info.factors, alt_synth_info.factors,
1386 (max_depth + 1) * sizeof (bool));
1387 one_over = false;
1388 }
1389 }
1390
1391 HOST_WIDE_INT n = real_to_integer (&whole_part);
1392 REAL_VALUE_TYPE cint;
1393 real_from_integer (&cint, VOIDmode, n, SIGNED);
1394
1395 if (!real_identical (&whole_part, &cint))
1396 return NULL_TREE;
1397
1398 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1399 return NULL_TREE;
1400
1401 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1402
1403 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1404
1405 /* Calculate the integer part of the exponent. */
1406 if (n > 1)
1407 {
1408 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1409 if (!integer_res)
1410 return NULL_TREE;
1411 }
1412
1413 if (dump_file)
1414 {
1415 char string[64];
1416
1417 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1418 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1419
1420 if (neg_exp)
1421 {
1422 if (one_over)
1423 {
1424 fprintf (dump_file, "1.0 / (");
1425 dump_integer_part (dump_file, "x", n);
1426 if (n > 0)
1427 fprintf (dump_file, " * ");
1428 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1429 fprintf (dump_file, ")");
1430 }
1431 else
1432 {
1433 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1434 fprintf (dump_file, " / (");
1435 dump_integer_part (dump_file, "x", n);
1436 fprintf (dump_file, ")");
1437 }
1438 }
1439 else
1440 {
1441 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1442 if (n > 0)
1443 fprintf (dump_file, " * ");
1444 dump_integer_part (dump_file, "x", n);
1445 }
1446
1447 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1448 }
1449
1450
1451 tree fract_res = NULL_TREE;
1452 cache[0] = arg0;
1453
1454 /* Calculate the fractional part of the exponent. */
1455 for (unsigned i = 0; i < synth_info.deepest; i++)
1456 {
1457 if (synth_info.factors[i])
1458 {
1459 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1460
1461 if (!fract_res)
1462 fract_res = sqrt_chain;
1463
1464 else
1465 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1466 fract_res, sqrt_chain);
1467 }
1468 }
1469
1470 tree res = NULL_TREE;
1471
1472 if (neg_exp)
1473 {
1474 if (one_over)
1475 {
1476 if (n > 0)
1477 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1478 fract_res, integer_res);
1479 else
1480 res = fract_res;
1481
1482 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1483 build_real (type, dconst1), res);
1484 }
1485 else
1486 {
1487 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1488 fract_res, integer_res);
1489 }
1490 }
1491 else
1492 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1493 fract_res, integer_res);
1494 return res;
1495 }
1496
1497 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1498 with location info LOC. If possible, create an equivalent and
1499 less expensive sequence of statements prior to GSI, and return an
1500 expression holding the result. */
1501
1502 static tree
1503 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1504 tree arg0, tree arg1)
1505 {
1506 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1507 REAL_VALUE_TYPE c2, dconst3;
1508 HOST_WIDE_INT n;
1509 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1510 machine_mode mode;
1511 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1512 bool hw_sqrt_exists, c_is_int, c2_is_int;
1513
1514 dconst1_4 = dconst1;
1515 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1516
1517 /* If the exponent isn't a constant, there's nothing of interest
1518 to be done. */
1519 if (TREE_CODE (arg1) != REAL_CST)
1520 return NULL_TREE;
1521
1522 /* If the exponent is equivalent to an integer, expand to an optimal
1523 multiplication sequence when profitable. */
1524 c = TREE_REAL_CST (arg1);
1525 n = real_to_integer (&c);
1526 real_from_integer (&cint, VOIDmode, n, SIGNED);
1527 c_is_int = real_identical (&c, &cint);
1528
1529 if (c_is_int
1530 && ((n >= -1 && n <= 2)
1531 || (flag_unsafe_math_optimizations
1532 && speed_p
1533 && powi_cost (n) <= POWI_MAX_MULTS)))
1534 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1535
1536 /* Attempt various optimizations using sqrt and cbrt. */
1537 type = TREE_TYPE (arg0);
1538 mode = TYPE_MODE (type);
1539 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1540
1541 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1542 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1543 sqrt(-0) = -0. */
1544 if (sqrtfn
1545 && REAL_VALUES_EQUAL (c, dconsthalf)
1546 && !HONOR_SIGNED_ZEROS (mode))
1547 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1548
1549 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1550
1551 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1552 optimizations since 1./3. is not exactly representable. If x
1553 is negative and finite, the correct value of pow(x,1./3.) is
1554 a NaN with the "invalid" exception raised, because the value
1555 of 1./3. actually has an even denominator. The correct value
1556 of cbrt(x) is a negative real value. */
1557 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1558 dconst1_3 = real_value_truncate (mode, dconst_third ());
1559
1560 if (flag_unsafe_math_optimizations
1561 && cbrtfn
1562 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1563 && REAL_VALUES_EQUAL (c, dconst1_3))
1564 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1565
1566 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1567 if we don't have a hardware sqrt insn. */
1568 dconst1_6 = dconst1_3;
1569 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1570
1571 if (flag_unsafe_math_optimizations
1572 && sqrtfn
1573 && cbrtfn
1574 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1575 && speed_p
1576 && hw_sqrt_exists
1577 && REAL_VALUES_EQUAL (c, dconst1_6))
1578 {
1579 /* sqrt(x) */
1580 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1581
1582 /* cbrt(sqrt(x)) */
1583 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1584 }
1585
1586
1587 /* Attempt to expand the POW as a product of square root chains.
1588 Expand the 0.25 case even when optimising for size. */
1589 if (flag_unsafe_math_optimizations
1590 && sqrtfn
1591 && hw_sqrt_exists
1592 && (speed_p || REAL_VALUES_EQUAL (c, dconst1_4))
1593 && !HONOR_SIGNED_ZEROS (mode))
1594 {
1595 unsigned int max_depth = speed_p
1596 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1597 : 2;
1598
1599 tree expand_with_sqrts
1600 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1601
1602 if (expand_with_sqrts)
1603 return expand_with_sqrts;
1604 }
1605
1606 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1607 n = real_to_integer (&c2);
1608 real_from_integer (&cint, VOIDmode, n, SIGNED);
1609 c2_is_int = real_identical (&c2, &cint);
1610
1611 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1612
1613 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1614 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1615
1616 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1617 different from pow(x, 1./3.) due to rounding and behavior with
1618 negative x, we need to constrain this transformation to unsafe
1619 math and positive x or finite math. */
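  /* For instance (illustration only): c = 5./3. gives n = 5, and the
     formula above yields powi (x, 1) * powi (cbrt (x), 2), i.e.
     x * cbrt (x) * cbrt (x), in place of the pow library call.  */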
1620 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1621 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1622 real_round (&c2, mode, &c2);
1623 n = real_to_integer (&c2);
1624 real_from_integer (&cint, VOIDmode, n, SIGNED);
1625 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1626 real_convert (&c2, mode, &c2);
1627
1628 if (flag_unsafe_math_optimizations
1629 && cbrtfn
1630 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1631 && real_identical (&c2, &c)
1632 && !c2_is_int
1633 && optimize_function_for_speed_p (cfun)
1634 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1635 {
1636 tree powi_x_ndiv3 = NULL_TREE;
1637
1638 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1639 possible or profitable, give up. Skip the degenerate case when
1640 abs(n) < 3, where the result is always 1. */
1641 if (absu_hwi (n) >= 3)
1642 {
1643 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1644 abs_hwi (n / 3));
1645 if (!powi_x_ndiv3)
1646 return NULL_TREE;
1647 }
1648
1649 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1650 as that creates an unnecessary variable. Instead, just produce
1651 either cbrt(x) or cbrt(x) * cbrt(x). */
1652 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1653
1654 if (absu_hwi (n) % 3 == 1)
1655 powi_cbrt_x = cbrt_x;
1656 else
1657 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1658 cbrt_x, cbrt_x);
1659
1660 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1661 if (absu_hwi (n) < 3)
1662 result = powi_cbrt_x;
1663 else
1664 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1665 powi_x_ndiv3, powi_cbrt_x);
1666
1667 /* If n is negative, reciprocate the result. */
1668 if (n < 0)
1669 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1670 build_real (type, dconst1), result);
1671
1672 return result;
1673 }
1674
1675 /* No optimizations succeeded. */
1676 return NULL_TREE;
1677 }
1678
1679 /* ARG is the argument to a cabs builtin call in GSI with location info
1680 LOC. Create a sequence of statements prior to GSI that calculates
1681 sqrt(R*R + I*I), where R and I are the real and imaginary components
1682 of ARG, respectively. Return an expression holding the result. */
1683
1684 static tree
1685 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1686 {
1687 tree real_part, imag_part, addend1, addend2, sum, result;
1688 tree type = TREE_TYPE (TREE_TYPE (arg));
1689 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1690 machine_mode mode = TYPE_MODE (type);
1691
1692 if (!flag_unsafe_math_optimizations
1693 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1694 || !sqrtfn
1695 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1696 return NULL_TREE;
1697
1698 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1699 REALPART_EXPR, arg);
1700 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1701 real_part, real_part);
1702 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1703 IMAGPART_EXPR, arg);
1704 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1705 imag_part, imag_part);
1706 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1707 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1708
1709 return result;
1710 }
1711
1712 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1713 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1714 an optimal number of multiplies, when n is a constant. */
1715
1716 namespace {
1717
1718 const pass_data pass_data_cse_sincos =
1719 {
1720 GIMPLE_PASS, /* type */
1721 "sincos", /* name */
1722 OPTGROUP_NONE, /* optinfo_flags */
1723 TV_NONE, /* tv_id */
1724 PROP_ssa, /* properties_required */
1725 0, /* properties_provided */
1726 0, /* properties_destroyed */
1727 0, /* todo_flags_start */
1728 TODO_update_ssa, /* todo_flags_finish */
1729 };
1730
1731 class pass_cse_sincos : public gimple_opt_pass
1732 {
1733 public:
1734 pass_cse_sincos (gcc::context *ctxt)
1735 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1736 {}
1737
1738 /* opt_pass methods: */
1739 virtual bool gate (function *)
1740 {
1741 /* We no longer require either sincos or cexp, since powi expansion
1742 piggybacks on this pass. */
1743 return optimize;
1744 }
1745
1746 virtual unsigned int execute (function *);
1747
1748 }; // class pass_cse_sincos
1749
1750 unsigned int
1751 pass_cse_sincos::execute (function *fun)
1752 {
1753 basic_block bb;
1754 bool cfg_changed = false;
1755
1756 calculate_dominance_info (CDI_DOMINATORS);
1757 memset (&sincos_stats, 0, sizeof (sincos_stats));
1758
1759 FOR_EACH_BB_FN (bb, fun)
1760 {
1761 gimple_stmt_iterator gsi;
1762 bool cleanup_eh = false;
1763
1764 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1765 {
1766 gimple stmt = gsi_stmt (gsi);
1767 tree fndecl;
1768
1769 /* Only the last stmt in a bb could throw, no need to call
1770 gimple_purge_dead_eh_edges if we change something in the middle
1771 of a basic block. */
1772 cleanup_eh = false;
1773
1774 if (is_gimple_call (stmt)
1775 && gimple_call_lhs (stmt)
1776 && (fndecl = gimple_call_fndecl (stmt))
1777 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1778 {
1779 tree arg, arg0, arg1, result;
1780 HOST_WIDE_INT n;
1781 location_t loc;
1782
1783 switch (DECL_FUNCTION_CODE (fndecl))
1784 {
1785 CASE_FLT_FN (BUILT_IN_COS):
1786 CASE_FLT_FN (BUILT_IN_SIN):
1787 CASE_FLT_FN (BUILT_IN_CEXPI):
1788 /* Make sure we have either sincos or cexp. */
1789 if (!targetm.libc_has_function (function_c99_math_complex)
1790 && !targetm.libc_has_function (function_sincos))
1791 break;
1792
1793 arg = gimple_call_arg (stmt, 0);
1794 if (TREE_CODE (arg) == SSA_NAME)
1795 cfg_changed |= execute_cse_sincos_1 (arg);
1796 break;
1797
1798 CASE_FLT_FN (BUILT_IN_POW):
1799 arg0 = gimple_call_arg (stmt, 0);
1800 arg1 = gimple_call_arg (stmt, 1);
1801
1802 loc = gimple_location (stmt);
1803 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1804
1805 if (result)
1806 {
1807 tree lhs = gimple_get_lhs (stmt);
1808 gassign *new_stmt = gimple_build_assign (lhs, result);
1809 gimple_set_location (new_stmt, loc);
1810 unlink_stmt_vdef (stmt);
1811 gsi_replace (&gsi, new_stmt, true);
1812 cleanup_eh = true;
1813 if (gimple_vdef (stmt))
1814 release_ssa_name (gimple_vdef (stmt));
1815 }
1816 break;
1817
1818 CASE_FLT_FN (BUILT_IN_POWI):
1819 arg0 = gimple_call_arg (stmt, 0);
1820 arg1 = gimple_call_arg (stmt, 1);
1821 loc = gimple_location (stmt);
1822
1823 if (real_minus_onep (arg0))
1824 {
1825 tree t0, t1, cond, one, minus_one;
1826 gassign *stmt;
1827
1828 t0 = TREE_TYPE (arg0);
1829 t1 = TREE_TYPE (arg1);
1830 one = build_real (t0, dconst1);
1831 minus_one = build_real (t0, dconstm1);
1832
1833 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1834 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1835 arg1, build_int_cst (t1, 1));
1836 gimple_set_location (stmt, loc);
1837 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1838
1839 result = make_temp_ssa_name (t0, NULL, "powi");
1840 stmt = gimple_build_assign (result, COND_EXPR, cond,
1841 minus_one, one);
1842 gimple_set_location (stmt, loc);
1843 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1844 }
1845 else
1846 {
1847 if (!tree_fits_shwi_p (arg1))
1848 break;
1849
1850 n = tree_to_shwi (arg1);
1851 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1852 }
1853
1854 if (result)
1855 {
1856 tree lhs = gimple_get_lhs (stmt);
1857 gassign *new_stmt = gimple_build_assign (lhs, result);
1858 gimple_set_location (new_stmt, loc);
1859 unlink_stmt_vdef (stmt);
1860 gsi_replace (&gsi, new_stmt, true);
1861 cleanup_eh = true;
1862 if (gimple_vdef (stmt))
1863 release_ssa_name (gimple_vdef (stmt));
1864 }
1865 break;
1866
1867 CASE_FLT_FN (BUILT_IN_CABS):
1868 arg0 = gimple_call_arg (stmt, 0);
1869 loc = gimple_location (stmt);
1870 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1871
1872 if (result)
1873 {
1874 tree lhs = gimple_get_lhs (stmt);
1875 gassign *new_stmt = gimple_build_assign (lhs, result);
1876 gimple_set_location (new_stmt, loc);
1877 unlink_stmt_vdef (stmt);
1878 gsi_replace (&gsi, new_stmt, true);
1879 cleanup_eh = true;
1880 if (gimple_vdef (stmt))
1881 release_ssa_name (gimple_vdef (stmt));
1882 }
1883 break;
1884
1885 default:;
1886 }
1887 }
1888 }
1889 if (cleanup_eh)
1890 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1891 }
1892
1893 statistics_counter_event (fun, "sincos statements inserted",
1894 sincos_stats.inserted);
1895
1896 free_dominance_info (CDI_DOMINATORS);
1897 return cfg_changed ? TODO_cleanup_cfg : 0;
1898 }
1899
1900 } // anon namespace
1901
1902 gimple_opt_pass *
1903 make_pass_cse_sincos (gcc::context *ctxt)
1904 {
1905 return new pass_cse_sincos (ctxt);
1906 }
1907
1908 /* A symbolic number is used to detect byte permutation and selection
1909 patterns. Therefore the field N contains an artificial number
1910 consisting of octet sized markers:
1911
1912 0 - target byte has the value 0
1913 FF - target byte has an unknown value (e.g. due to sign extension)
1914 1..size - marker value is the index of the source byte, counting from 1 at the lsb.
1915
1916 To detect permutations on memory sources (arrays and structures), a symbolic
1917 number is also associated with a base address (the array or structure the load
1918 is made from), an offset from that base address, and a range, which gives the
1919 distance between the highest and lowest memory locations accessed to build
1920 such a symbolic number. The range is thus different from size, which reflects
1921 the size of the type of the current expression. Note that for a non-memory
1922 source, range holds the same value as size.
1923
1924 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1925 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1926 still have a size of 2 but this time a range of 1. */
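/* For example, a plain 32-bit value is described by the symbolic number
   0x04030201: the least significant target byte holds source byte 1, the
   next one source byte 2, and so on.  After a full byte swap of that
   value the markers appear in reverse order, giving 0x01020304.  */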
1927
1928 struct symbolic_number {
1929 uint64_t n;
1930 tree type;
1931 tree base_addr;
1932 tree offset;
1933 HOST_WIDE_INT bytepos;
1934 tree alias_set;
1935 tree vuse;
1936 unsigned HOST_WIDE_INT range;
1937 };
1938
1939 #define BITS_PER_MARKER 8
1940 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1941 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1942 #define HEAD_MARKER(n, size) \
1943 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1944
1945 /* The number which the find_bswap_or_nop_1 result should match in
1946 order to have a nop. The number is masked according to the size of
1947 the symbolic number before using it. */
1948 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1949 (uint64_t)0x08070605 << 32 | 0x04030201)
1950
1951 /* The number which the find_bswap_or_nop_1 result should match in
1952 order to have a byte swap. The number is shifted right according to the
1953 size of the symbolic number before using it. */
1954 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1955 (uint64_t)0x01020304 << 32 | 0x05060708)
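/* When the symbolic number covers fewer than 8 bytes, these constants are
   reduced accordingly before the comparison: for a 4 byte value CMPNOP is
   masked down to 0x04030201 and CMPXCHG is shifted down to 0x01020304,
   and for a 2 byte value they become 0x0201 and 0x0102 respectively.  */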
1956
1957 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1958 number N. Return false if the requested operation is not permitted
1959 on a symbolic number. */
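/* For instance, on the 4 byte symbolic number 0x04030201 a left shift by
   8 bits yields 0x03020100 (the vacated low byte is known to be zero),
   while an arithmetic right shift by 8 bits of a signed value whose head
   marker is set yields 0xFF040302, the new high byte being marked unknown
   because it depends on the sign.  */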
1960
1961 static inline bool
1962 do_shift_rotate (enum tree_code code,
1963 struct symbolic_number *n,
1964 int count)
1965 {
1966 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1967 unsigned head_marker;
1968
1969 if (count % BITS_PER_UNIT != 0)
1970 return false;
1971 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1972
1973 /* Zero out the extra bits of N in order to avoid them being shifted
1974 into the significant bits. */
1975 if (size < 64 / BITS_PER_MARKER)
1976 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1977
1978 switch (code)
1979 {
1980 case LSHIFT_EXPR:
1981 n->n <<= count;
1982 break;
1983 case RSHIFT_EXPR:
1984 head_marker = HEAD_MARKER (n->n, size);
1985 n->n >>= count;
1986 /* Arithmetic shift of signed type: result is dependent on the value. */
1987 if (!TYPE_UNSIGNED (n->type) && head_marker)
1988 for (i = 0; i < count / BITS_PER_MARKER; i++)
1989 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1990 << ((size - 1 - i) * BITS_PER_MARKER);
1991 break;
1992 case LROTATE_EXPR:
1993 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1994 break;
1995 case RROTATE_EXPR:
1996 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1997 break;
1998 default:
1999 return false;
2000 }
2001 /* Zero unused bits for size. */
2002 if (size < 64 / BITS_PER_MARKER)
2003 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2004 return true;
2005 }
2006
2007 /* Perform sanity checking for the symbolic number N and the gimple
2008 statement STMT. */
2009
2010 static inline bool
2011 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
2012 {
2013 tree lhs_type;
2014
2015 lhs_type = gimple_expr_type (stmt);
2016
2017 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2018 return false;
2019
2020 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2021 return false;
2022
2023 return true;
2024 }
2025
2026 /* Initialize the symbolic number N for the bswap pass from the base element
2027 SRC manipulated by the bitwise OR expression. */
2028
2029 static bool
2030 init_symbolic_number (struct symbolic_number *n, tree src)
2031 {
2032 int size;
2033
2034 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2035
2036 /* Set up the symbolic number N by setting each byte marker to a value between
2037 1 and the byte size of SRC. The highest order byte is set to that size and
2038 the lowest order byte to 1. */
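/* For a 32-bit SRC, for instance, this leaves n->n equal to 0x04030201
   and n->range equal to 4.  */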
2039 n->type = TREE_TYPE (src);
2040 size = TYPE_PRECISION (n->type);
2041 if (size % BITS_PER_UNIT != 0)
2042 return false;
2043 size /= BITS_PER_UNIT;
2044 if (size > 64 / BITS_PER_MARKER)
2045 return false;
2046 n->range = size;
2047 n->n = CMPNOP;
2048
2049 if (size < 64 / BITS_PER_MARKER)
2050 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2051
2052 return true;
2053 }
2054
2055 /* Check whether STMT might be a byte swap or a nop from a memory source and
2056 return the answer. If so, REF is that memory source, and the base of the
2057 memory area accessed and the offset of the access from that base are recorded in N. */
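/* For example, a load of a[3] from "char a[]" records a bytepos of 3 and
   an initial range of 1 byte; merging with loads of neighbouring elements
   later grows the range accordingly.  */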
2058
2059 bool
2060 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
2061 {
2062 /* The leaf node is an array or component ref. Memorize its base and
2063 offset from the base to compare against other such leaf nodes. */
2064 HOST_WIDE_INT bitsize, bitpos;
2065 machine_mode mode;
2066 int unsignedp, volatilep;
2067 tree offset, base_addr;
2068
2069 /* Not prepared to handle PDP endian. */
2070 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2071 return false;
2072
2073 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2074 return false;
2075
2076 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2077 &unsignedp, &volatilep, false);
2078
2079 if (TREE_CODE (base_addr) == MEM_REF)
2080 {
2081 offset_int bit_offset = 0;
2082 tree off = TREE_OPERAND (base_addr, 1);
2083
2084 if (!integer_zerop (off))
2085 {
2086 offset_int boff, coff = mem_ref_offset (base_addr);
2087 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2088 bit_offset += boff;
2089 }
2090
2091 base_addr = TREE_OPERAND (base_addr, 0);
2092
2093 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2094 if (wi::neg_p (bit_offset))
2095 {
2096 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2097 offset_int tem = bit_offset.and_not (mask);
2098 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2099 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2100 bit_offset -= tem;
2101 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2102 if (offset)
2103 offset = size_binop (PLUS_EXPR, offset,
2104 wide_int_to_tree (sizetype, tem));
2105 else
2106 offset = wide_int_to_tree (sizetype, tem);
2107 }
2108
2109 bitpos += bit_offset.to_shwi ();
2110 }
2111
2112 if (bitpos % BITS_PER_UNIT)
2113 return false;
2114 if (bitsize % BITS_PER_UNIT)
2115 return false;
2116
2117 if (!init_symbolic_number (n, ref))
2118 return false;
2119 n->base_addr = base_addr;
2120 n->offset = offset;
2121 n->bytepos = bitpos / BITS_PER_UNIT;
2122 n->alias_set = reference_alias_ptr_type (ref);
2123 n->vuse = gimple_vuse (stmt);
2124 return true;
2125 }
2126
2127 /* Compute the symbolic number N representing the result of a bitwise OR on
2128 two symbolic numbers N1 and N2 whose source statements are respectively
2129 SOURCE_STMT1 and SOURCE_STMT2. */
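/* For instance, if N1 describes a 2 byte load at byte position 0 of some
   array and N2 a 2 byte load at byte position 2 of the same array, the
   merged number covers a range of 4 bytes and combines the markers of
   both operands, provided no target byte receives two different source
   bytes.  */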
2130
2131 static gimple
2132 perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
2133 gimple source_stmt2, struct symbolic_number *n2,
2134 struct symbolic_number *n)
2135 {
2136 int i, size;
2137 uint64_t mask;
2138 gimple source_stmt;
2139 struct symbolic_number *n_start;
2140
2141 /* The sources are different; cancel the bswap if they are not memory
2142 locations with the same base (array, structure, ...). */
2143 if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
2144 {
2145 int64_t inc;
2146 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2147 struct symbolic_number *toinc_n_ptr, *n_end;
2148
2149 if (!n1->base_addr || !n2->base_addr
2150 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2151 return NULL;
2152
2153 if (!n1->offset != !n2->offset
2154 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2155 return NULL;
2156
2157 if (n1->bytepos < n2->bytepos)
2158 {
2159 n_start = n1;
2160 start_sub = n2->bytepos - n1->bytepos;
2161 source_stmt = source_stmt1;
2162 }
2163 else
2164 {
2165 n_start = n2;
2166 start_sub = n1->bytepos - n2->bytepos;
2167 source_stmt = source_stmt2;
2168 }
2169
2170 /* Find the highest address at which a load is performed and
2171 compute related info. */
2172 end1 = n1->bytepos + (n1->range - 1);
2173 end2 = n2->bytepos + (n2->range - 1);
2174 if (end1 < end2)
2175 {
2176 end = end2;
2177 end_sub = end2 - end1;
2178 }
2179 else
2180 {
2181 end = end1;
2182 end_sub = end1 - end2;
2183 }
2184 n_end = (end2 > end1) ? n2 : n1;
2185
2186 /* Find the symbolic number whose lsb is the most significant. */
2187 if (BYTES_BIG_ENDIAN)
2188 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2189 else
2190 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2191
2192 n->range = end - n_start->bytepos + 1;
2193
2194 /* Check that the range of memory covered can be represented by
2195 a symbolic number. */
2196 if (n->range > 64 / BITS_PER_MARKER)
2197 return NULL;
2198
2199 /* Reinterpret the byte marks in the symbolic number holding the value of
2200 bigger weight, according to the target endianness. */
2201 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2202 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2203 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2204 {
2205 unsigned marker
2206 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2207 if (marker && marker != MARKER_BYTE_UNKNOWN)
2208 toinc_n_ptr->n += inc;
2209 }
2210 }
2211 else
2212 {
2213 n->range = n1->range;
2214 n_start = n1;
2215 source_stmt = source_stmt1;
2216 }
2217
2218 if (!n1->alias_set
2219 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2220 n->alias_set = n1->alias_set;
2221 else
2222 n->alias_set = ptr_type_node;
2223 n->vuse = n_start->vuse;
2224 n->base_addr = n_start->base_addr;
2225 n->offset = n_start->offset;
2226 n->bytepos = n_start->bytepos;
2227 n->type = n_start->type;
2228 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2229
2230 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2231 {
2232 uint64_t masked1, masked2;
2233
2234 masked1 = n1->n & mask;
2235 masked2 = n2->n & mask;
2236 if (masked1 && masked2 && masked1 != masked2)
2237 return NULL;
2238 }
2239 n->n = n1->n | n2->n;
2240
2241 return source_stmt;
2242 }
2243
2244 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2245 the operation given by the rhs of STMT on the result. If the operation
2246 can successfully be executed, the function returns a gimple stmt whose
2247 rhs's first tree is the expression of the source operand; otherwise it
2248 returns NULL. */
2249
2250 static gimple
2251 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
2252 {
2253 enum tree_code code;
2254 tree rhs1, rhs2 = NULL;
2255 gimple rhs1_stmt, rhs2_stmt, source_stmt1;
2256 enum gimple_rhs_class rhs_class;
2257
2258 if (!limit || !is_gimple_assign (stmt))
2259 return NULL;
2260
2261 rhs1 = gimple_assign_rhs1 (stmt);
2262
2263 if (find_bswap_or_nop_load (stmt, rhs1, n))
2264 return stmt;
2265
2266 if (TREE_CODE (rhs1) != SSA_NAME)
2267 return NULL;
2268
2269 code = gimple_assign_rhs_code (stmt);
2270 rhs_class = gimple_assign_rhs_class (stmt);
2271 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2272
2273 if (rhs_class == GIMPLE_BINARY_RHS)
2274 rhs2 = gimple_assign_rhs2 (stmt);
2275
2276 /* Handle unary rhs and binary rhs with integer constants as second
2277 operand. */
2278
2279 if (rhs_class == GIMPLE_UNARY_RHS
2280 || (rhs_class == GIMPLE_BINARY_RHS
2281 && TREE_CODE (rhs2) == INTEGER_CST))
2282 {
2283 if (code != BIT_AND_EXPR
2284 && code != LSHIFT_EXPR
2285 && code != RSHIFT_EXPR
2286 && code != LROTATE_EXPR
2287 && code != RROTATE_EXPR
2288 && !CONVERT_EXPR_CODE_P (code))
2289 return NULL;
2290
2291 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2292
2293 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2294 we have to initialize the symbolic number. */
2295 if (!source_stmt1)
2296 {
2297 if (gimple_assign_load_p (stmt)
2298 || !init_symbolic_number (n, rhs1))
2299 return NULL;
2300 source_stmt1 = stmt;
2301 }
2302
2303 switch (code)
2304 {
2305 case BIT_AND_EXPR:
2306 {
2307 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2308 uint64_t val = int_cst_value (rhs2), mask = 0;
2309 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2310
2311 /* Only constants masking full bytes are allowed. */
2312 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2313 if ((val & tmp) != 0 && (val & tmp) != tmp)
2314 return NULL;
2315 else if (val & tmp)
2316 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2317
2318 n->n &= mask;
2319 }
2320 break;
2321 case LSHIFT_EXPR:
2322 case RSHIFT_EXPR:
2323 case LROTATE_EXPR:
2324 case RROTATE_EXPR:
2325 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2326 return NULL;
2327 break;
2328 CASE_CONVERT:
2329 {
2330 int i, type_size, old_type_size;
2331 tree type;
2332
2333 type = gimple_expr_type (stmt);
2334 type_size = TYPE_PRECISION (type);
2335 if (type_size % BITS_PER_UNIT != 0)
2336 return NULL;
2337 type_size /= BITS_PER_UNIT;
2338 if (type_size > 64 / BITS_PER_MARKER)
2339 return NULL;
2340
2341 /* Sign extension: result is dependent on the value. */
2342 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2343 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2344 && HEAD_MARKER (n->n, old_type_size))
2345 for (i = 0; i < type_size - old_type_size; i++)
2346 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2347 << ((type_size - 1 - i) * BITS_PER_MARKER);
2348
2349 if (type_size < 64 / BITS_PER_MARKER)
2350 {
2351 /* If STMT casts to a smaller type mask out the bits not
2352 belonging to the target type. */
2353 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2354 }
2355 n->type = type;
2356 if (!n->base_addr)
2357 n->range = type_size;
2358 }
2359 break;
2360 default:
2361 return NULL;
2362 };
2363 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2364 }
2365
2366 /* Handle binary rhs. */
2367
2368 if (rhs_class == GIMPLE_BINARY_RHS)
2369 {
2370 struct symbolic_number n1, n2;
2371 gimple source_stmt, source_stmt2;
2372
2373 if (code != BIT_IOR_EXPR)
2374 return NULL;
2375
2376 if (TREE_CODE (rhs2) != SSA_NAME)
2377 return NULL;
2378
2379 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2380
2381 switch (code)
2382 {
2383 case BIT_IOR_EXPR:
2384 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2385
2386 if (!source_stmt1)
2387 return NULL;
2388
2389 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2390
2391 if (!source_stmt2)
2392 return NULL;
2393
2394 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2395 return NULL;
2396
2397 if (!n1.vuse != !n2.vuse
2398 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2399 return NULL;
2400
2401 source_stmt
2402 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2403
2404 if (!source_stmt)
2405 return NULL;
2406
2407 if (!verify_symbolic_number_p (n, stmt))
2408 return NULL;
2409
2410 break;
2411 default:
2412 return NULL;
2413 }
2414 return source_stmt;
2415 }
2416 return NULL;
2417 }
2418
2419 /* Check whether STMT completes a bswap implementation or a read in a given
2420 endianness consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2421 accordingly. It also sets N to describe the operations
2422 performed: the size of the resulting expression and whether it works on
2423 a memory source, and if so its alias set and vuse. Finally, the
2424 function returns a stmt whose rhs's first tree is the source
2425 expression. */
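/* For instance, for an unsigned 32-bit X the classic open-coded swap

     (X >> 24) | ((X >> 8) & 0xff00) | ((X << 8) & 0xff0000) | (X << 24)

   walks down to the symbolic number 0x01020304, which matches CMPXCHG for
   a 4 byte value, so *BSWAP is set to true.  An expression that merely
   reassembles the bytes in their original order matches CMPNOP instead
   and *BSWAP is set to false.  */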
2426
2427 static gimple
2428 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2429 {
2430 /* The number which the find_bswap_or_nop_1 result should match in order
2431 to have a full byte swap. The number is shifted to the right
2432 according to the size of the symbolic number before using it. */
2433 uint64_t cmpxchg = CMPXCHG;
2434 uint64_t cmpnop = CMPNOP;
2435
2436 gimple source_stmt;
2437 int limit;
2438
2439 /* The last parameter determines the search depth limit. It usually
2440 correlates directly to the number n of bytes to be touched. We
2441 increase that number by log2(n) + 1 here in order to also
2442 cover signed -> unsigned conversions of the src operand as can be seen
2443 in libgcc, and for the initial shift/and operation of the src operand. */
2444 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2445 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2446 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2447
2448 if (!source_stmt)
2449 return NULL;
2450
2451 /* Find real size of result (highest non-zero byte). */
2452 if (n->base_addr)
2453 {
2454 int rsize;
2455 uint64_t tmpn;
2456
2457 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2458 n->range = rsize;
2459 }
2460
2461 /* Zero out the extra bits of N and CMP*. */
2462 if (n->range < (int) sizeof (int64_t))
2463 {
2464 uint64_t mask;
2465
2466 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2467 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2468 cmpnop &= mask;
2469 }
2470
2471 /* A complete byte swap should make the symbolic number start with
2472 the largest digit in the highest order byte. An unchanged symbolic
2473 number indicates a read with the same endianness as the target architecture. */
2474 if (n->n == cmpnop)
2475 *bswap = false;
2476 else if (n->n == cmpxchg)
2477 *bswap = true;
2478 else
2479 return NULL;
2480
2481 /* Useless bit manipulation performed by code. */
2482 if (!n->base_addr && n->n == cmpnop)
2483 return NULL;
2484
2485 n->range *= BITS_PER_UNIT;
2486 return source_stmt;
2487 }
2488
2489 namespace {
2490
2491 const pass_data pass_data_optimize_bswap =
2492 {
2493 GIMPLE_PASS, /* type */
2494 "bswap", /* name */
2495 OPTGROUP_NONE, /* optinfo_flags */
2496 TV_NONE, /* tv_id */
2497 PROP_ssa, /* properties_required */
2498 0, /* properties_provided */
2499 0, /* properties_destroyed */
2500 0, /* todo_flags_start */
2501 0, /* todo_flags_finish */
2502 };
2503
2504 class pass_optimize_bswap : public gimple_opt_pass
2505 {
2506 public:
2507 pass_optimize_bswap (gcc::context *ctxt)
2508 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2509 {}
2510
2511 /* opt_pass methods: */
2512 virtual bool gate (function *)
2513 {
2514 return flag_expensive_optimizations && optimize;
2515 }
2516
2517 virtual unsigned int execute (function *);
2518
2519 }; // class pass_optimize_bswap
2520
2521 /* Perform the bswap optimization: replace the expression computed in the rhs
2522 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2523 Which of these alternatives replaces the rhs is given by N->base_addr (non
2524 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2525 load to perform are also given in N, while the builtin bswap to invoke is
2526 given in FNDECL. Finally, if a load is involved, SRC_STMT refers to one of
2527 the load statements used to construct the rhs in CUR_STMT and N->range gives
2528 the size of the rhs expression for maintaining some statistics.
2529
2530 Note that if the replacement involves a load, CUR_STMT is moved just after
2531 SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2532 changing basic block. */
2533
2534 static bool
2535 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2536 tree load_type, struct symbolic_number *n, bool bswap)
2537 {
2538 gimple_stmt_iterator gsi;
2539 tree src, tmp, tgt;
2540 gimple bswap_stmt;
2541
2542 gsi = gsi_for_stmt (cur_stmt);
2543 src = gimple_assign_rhs1 (src_stmt);
2544 tgt = gimple_assign_lhs (cur_stmt);
2545
2546 /* Need to load the value from memory first. */
2547 if (n->base_addr)
2548 {
2549 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2550 tree addr_expr, addr_tmp, val_expr, val_tmp;
2551 tree load_offset_ptr, aligned_load_type;
2552 gimple addr_stmt, load_stmt;
2553 unsigned align;
2554 HOST_WIDE_INT load_offset = 0;
2555
2556 align = get_object_alignment (src);
2557 /* If the new access is smaller than the original one, we need
2558 to perform a big-endian adjustment. */
2559 if (BYTES_BIG_ENDIAN)
2560 {
2561 HOST_WIDE_INT bitsize, bitpos;
2562 machine_mode mode;
2563 int unsignedp, volatilep;
2564 tree offset;
2565
2566 get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2567 &unsignedp, &volatilep, false);
2568 if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2569 {
2570 load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2571 unsigned HOST_WIDE_INT l
2572 = (load_offset * BITS_PER_UNIT) & (align - 1);
2573 if (l)
2574 align = l & -l;
2575 }
2576 }
2577
2578 if (bswap
2579 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2580 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2581 return false;
2582
2583 /* Move cur_stmt just before one of the loads of the original
2584 sequence to ensure it has the same VUSE. See PR61517 for what
2585 could go wrong. */
2586 gsi_move_before (&gsi, &gsi_ins);
2587 gsi = gsi_for_stmt (cur_stmt);
2588
2589 /* Compute address to load from and cast according to the size
2590 of the load. */
2591 addr_expr = build_fold_addr_expr (unshare_expr (src));
2592 if (is_gimple_mem_ref_addr (addr_expr))
2593 addr_tmp = addr_expr;
2594 else
2595 {
2596 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2597 "load_src");
2598 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2599 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2600 }
2601
2602 /* Perform the load. */
2603 aligned_load_type = load_type;
2604 if (align < TYPE_ALIGN (load_type))
2605 aligned_load_type = build_aligned_type (load_type, align);
2606 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2607 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2608 load_offset_ptr);
2609
2610 if (!bswap)
2611 {
2612 if (n->range == 16)
2613 nop_stats.found_16bit++;
2614 else if (n->range == 32)
2615 nop_stats.found_32bit++;
2616 else
2617 {
2618 gcc_assert (n->range == 64);
2619 nop_stats.found_64bit++;
2620 }
2621
2622 /* Convert the result of load if necessary. */
2623 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2624 {
2625 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2626 "load_dst");
2627 load_stmt = gimple_build_assign (val_tmp, val_expr);
2628 gimple_set_vuse (load_stmt, n->vuse);
2629 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2630 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2631 }
2632 else
2633 {
2634 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2635 gimple_set_vuse (cur_stmt, n->vuse);
2636 }
2637 update_stmt (cur_stmt);
2638
2639 if (dump_file)
2640 {
2641 fprintf (dump_file,
2642 "%d bit load in target endianness found at: ",
2643 (int) n->range);
2644 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2645 }
2646 return true;
2647 }
2648 else
2649 {
2650 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2651 load_stmt = gimple_build_assign (val_tmp, val_expr);
2652 gimple_set_vuse (load_stmt, n->vuse);
2653 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2654 }
2655 src = val_tmp;
2656 }
2657
2658 if (n->range == 16)
2659 bswap_stats.found_16bit++;
2660 else if (n->range == 32)
2661 bswap_stats.found_32bit++;
2662 else
2663 {
2664 gcc_assert (n->range == 64);
2665 bswap_stats.found_64bit++;
2666 }
2667
2668 tmp = src;
2669
2670 /* Convert the src expression if necessary. */
2671 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2672 {
2673 gimple convert_stmt;
2674
2675 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2676 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2677 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2678 }
2679
2680 /* The canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
2681 values are handled this way because rotating a 2N-bit value by N bits is
2682 generally not equivalent to a bswap. Consider for instance 0x01020304 r>> 16,
2683 which gives 0x03040102 while a bswap of that value is 0x04030201. */
2684 if (bswap && n->range == 16)
2685 {
2686 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2687 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2688 bswap_stmt = gimple_build_assign (NULL, src);
2689 }
2690 else
2691 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2692
2693 tmp = tgt;
2694
2695 /* Convert the result if necessary. */
2696 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2697 {
2698 gimple convert_stmt;
2699
2700 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2701 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2702 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2703 }
2704
2705 gimple_set_lhs (bswap_stmt, tmp);
2706
2707 if (dump_file)
2708 {
2709 fprintf (dump_file, "%d bit bswap implementation found at: ",
2710 (int) n->range);
2711 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2712 }
2713
2714 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2715 gsi_remove (&gsi, true);
2716 return true;
2717 }
2718
2719 /* Find manual byte swap implementations as well as loads in a given
2720 endianness. Byte swaps are turned into a bswap builtin invocation,
2721 while endian loads are converted to a bswap builtin invocation or a
2722 simple load according to the target endianness. */
2723
2724 unsigned int
2725 pass_optimize_bswap::execute (function *fun)
2726 {
2727 basic_block bb;
2728 bool bswap32_p, bswap64_p;
2729 bool changed = false;
2730 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2731
2732 if (BITS_PER_UNIT != 8)
2733 return 0;
2734
2735 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2736 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2737 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2738 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2739 || (bswap32_p && word_mode == SImode)));
2740
2741 /* Determine the argument type of the builtins. The code later on
2742 assumes that the return and argument type are the same. */
2743 if (bswap32_p)
2744 {
2745 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2746 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2747 }
2748
2749 if (bswap64_p)
2750 {
2751 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2752 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2753 }
2754
2755 memset (&nop_stats, 0, sizeof (nop_stats));
2756 memset (&bswap_stats, 0, sizeof (bswap_stats));
2757
2758 FOR_EACH_BB_FN (bb, fun)
2759 {
2760 gimple_stmt_iterator gsi;
2761
2762 /* We do a reverse scan for bswap patterns to make sure we get the
2763 widest match. As bswap pattern matching doesn't handle previously
2764 inserted smaller bswap replacements as sub-patterns, the wider
2765 variant wouldn't be detected. */
2766 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2767 {
2768 gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2769 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2770 enum tree_code code;
2771 struct symbolic_number n;
2772 bool bswap;
2773
2774 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2775 might be moved to a different basic block by bswap_replace and gsi
2776 must not point to it in that case. Doing the gsi_prev here makes
2777 sure that gsi points to the statement previous to
2778 cur_stmt while still making sure that all statements in
2779 this basic block are considered. */
2780 gsi_prev (&gsi);
2781
2782 if (!is_gimple_assign (cur_stmt))
2783 continue;
2784
2785 code = gimple_assign_rhs_code (cur_stmt);
2786 switch (code)
2787 {
2788 case LROTATE_EXPR:
2789 case RROTATE_EXPR:
2790 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2791 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2792 % BITS_PER_UNIT)
2793 continue;
2794 /* Fall through. */
2795 case BIT_IOR_EXPR:
2796 break;
2797 default:
2798 continue;
2799 }
2800
2801 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2802
2803 if (!src_stmt)
2804 continue;
2805
2806 switch (n.range)
2807 {
2808 case 16:
2809 /* Already in canonical form, nothing to do. */
2810 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2811 continue;
2812 load_type = bswap_type = uint16_type_node;
2813 break;
2814 case 32:
2815 load_type = uint32_type_node;
2816 if (bswap32_p)
2817 {
2818 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2819 bswap_type = bswap32_type;
2820 }
2821 break;
2822 case 64:
2823 load_type = uint64_type_node;
2824 if (bswap64_p)
2825 {
2826 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2827 bswap_type = bswap64_type;
2828 }
2829 break;
2830 default:
2831 continue;
2832 }
2833
2834 if (bswap && !fndecl && n.range != 16)
2835 continue;
2836
2837 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2838 &n, bswap))
2839 changed = true;
2840 }
2841 }
2842
2843 statistics_counter_event (fun, "16-bit nop implementations found",
2844 nop_stats.found_16bit);
2845 statistics_counter_event (fun, "32-bit nop implementations found",
2846 nop_stats.found_32bit);
2847 statistics_counter_event (fun, "64-bit nop implementations found",
2848 nop_stats.found_64bit);
2849 statistics_counter_event (fun, "16-bit bswap implementations found",
2850 bswap_stats.found_16bit);
2851 statistics_counter_event (fun, "32-bit bswap implementations found",
2852 bswap_stats.found_32bit);
2853 statistics_counter_event (fun, "64-bit bswap implementations found",
2854 bswap_stats.found_64bit);
2855
2856 return (changed ? TODO_update_ssa : 0);
2857 }
2858
2859 } // anon namespace
2860
2861 gimple_opt_pass *
2862 make_pass_optimize_bswap (gcc::context *ctxt)
2863 {
2864 return new pass_optimize_bswap (ctxt);
2865 }
2866
2867 /* Return true if stmt is a type conversion operation that can be stripped
2868 when used in a widening multiply operation. */
2869 static bool
2870 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2871 {
2872 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2873
2874 if (TREE_CODE (result_type) == INTEGER_TYPE)
2875 {
2876 tree op_type;
2877 tree inner_op_type;
2878
2879 if (!CONVERT_EXPR_CODE_P (rhs_code))
2880 return false;
2881
2882 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2883
2884 /* If the type of OP has the same precision as the result, then
2885 we can strip this conversion. The multiply operation will be
2886 selected to create the correct extension as a by-product. */
2887 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2888 return true;
2889
2890 /* We can also strip a conversion if it preserves the signed-ness of
2891 the operation and doesn't narrow the range. */
2892 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2893
2894 /* If the inner-most type is unsigned, then we can strip any
2895 intermediate widening operation. If it's signed, then the
2896 intermediate widening operation must also be signed. */
2897 if ((TYPE_UNSIGNED (inner_op_type)
2898 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2899 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2900 return true;
2901
2902 return false;
2903 }
2904
2905 return rhs_code == FIXED_CONVERT_EXPR;
2906 }
2907
2908 /* Return true if RHS is a suitable operand for a widening multiplication,
2909 assuming a target type of TYPE.
2910 There are two cases:
2911
2912 - RHS makes some value at least twice as wide. Store that value
2913 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2914
2915 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2916 but leave *TYPE_OUT untouched. */
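/* For instance, with TYPE a 32-bit int and RHS defined by a statement of
   the form "tmp = (int) s" for a 16-bit short S, *NEW_RHS_OUT is set to S
   and *TYPE_OUT to the 16-bit type, since the conversion can be stripped
   and recreated by the widening multiply itself.  */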
2917
2918 static bool
2919 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2920 tree *new_rhs_out)
2921 {
2922 gimple stmt;
2923 tree type1, rhs1;
2924
2925 if (TREE_CODE (rhs) == SSA_NAME)
2926 {
2927 stmt = SSA_NAME_DEF_STMT (rhs);
2928 if (is_gimple_assign (stmt))
2929 {
2930 if (! widening_mult_conversion_strippable_p (type, stmt))
2931 rhs1 = rhs;
2932 else
2933 {
2934 rhs1 = gimple_assign_rhs1 (stmt);
2935
2936 if (TREE_CODE (rhs1) == INTEGER_CST)
2937 {
2938 *new_rhs_out = rhs1;
2939 *type_out = NULL;
2940 return true;
2941 }
2942 }
2943 }
2944 else
2945 rhs1 = rhs;
2946
2947 type1 = TREE_TYPE (rhs1);
2948
2949 if (TREE_CODE (type1) != TREE_CODE (type)
2950 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2951 return false;
2952
2953 *new_rhs_out = rhs1;
2954 *type_out = type1;
2955 return true;
2956 }
2957
2958 if (TREE_CODE (rhs) == INTEGER_CST)
2959 {
2960 *new_rhs_out = rhs;
2961 *type_out = NULL;
2962 return true;
2963 }
2964
2965 return false;
2966 }
2967
2968 /* Return true if STMT performs a widening multiplication, assuming the
2969 output type is TYPE. If so, store the unwidened types of the operands
2970 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2971 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2972 and *TYPE2_OUT would give the operands of the multiplication. */
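/* The typical case, on a target with 32-bit int and 64-bit long long, is
   "(long long) a * (long long) b" with int operands A and B feeding a
   long long result: both conversions are stripped and *TYPE1_OUT and
   *TYPE2_OUT are set to the 32-bit type.  */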
2973
2974 static bool
2975 is_widening_mult_p (gimple stmt,
2976 tree *type1_out, tree *rhs1_out,
2977 tree *type2_out, tree *rhs2_out)
2978 {
2979 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2980
2981 if (TREE_CODE (type) != INTEGER_TYPE
2982 && TREE_CODE (type) != FIXED_POINT_TYPE)
2983 return false;
2984
2985 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2986 rhs1_out))
2987 return false;
2988
2989 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2990 rhs2_out))
2991 return false;
2992
2993 if (*type1_out == NULL)
2994 {
2995 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2996 return false;
2997 *type1_out = *type2_out;
2998 }
2999
3000 if (*type2_out == NULL)
3001 {
3002 if (!int_fits_type_p (*rhs2_out, *type1_out))
3003 return false;
3004 *type2_out = *type1_out;
3005 }
3006
3007 /* Ensure that the larger of the two operands comes first. */
3008 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3009 {
3010 std::swap (*type1_out, *type2_out);
3011 std::swap (*rhs1_out, *rhs2_out);
3012 }
3013
3014 return true;
3015 }
3016
3017 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3018 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3019 value is true iff we converted the statement. */
3020
3021 static bool
3022 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
3023 {
3024 tree lhs, rhs1, rhs2, type, type1, type2;
3025 enum insn_code handler;
3026 machine_mode to_mode, from_mode, actual_mode;
3027 optab op;
3028 int actual_precision;
3029 location_t loc = gimple_location (stmt);
3030 bool from_unsigned1, from_unsigned2;
3031
3032 lhs = gimple_assign_lhs (stmt);
3033 type = TREE_TYPE (lhs);
3034 if (TREE_CODE (type) != INTEGER_TYPE)
3035 return false;
3036
3037 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3038 return false;
3039
3040 to_mode = TYPE_MODE (type);
3041 from_mode = TYPE_MODE (type1);
3042 from_unsigned1 = TYPE_UNSIGNED (type1);
3043 from_unsigned2 = TYPE_UNSIGNED (type2);
3044
3045 if (from_unsigned1 && from_unsigned2)
3046 op = umul_widen_optab;
3047 else if (!from_unsigned1 && !from_unsigned2)
3048 op = smul_widen_optab;
3049 else
3050 op = usmul_widen_optab;
3051
3052 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3053 0, &actual_mode);
3054
3055 if (handler == CODE_FOR_nothing)
3056 {
3057 if (op != smul_widen_optab)
3058 {
3059 /* We can use a signed multiply with unsigned types as long as
3060 there is a wider mode to use, or it is the smaller of the two
3061 types that is unsigned. Note that type1 >= type2, always. */
3062 if ((TYPE_UNSIGNED (type1)
3063 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3064 || (TYPE_UNSIGNED (type2)
3065 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3066 {
3067 from_mode = GET_MODE_WIDER_MODE (from_mode);
3068 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3069 return false;
3070 }
3071
3072 op = smul_widen_optab;
3073 handler = find_widening_optab_handler_and_mode (op, to_mode,
3074 from_mode, 0,
3075 &actual_mode);
3076
3077 if (handler == CODE_FOR_nothing)
3078 return false;
3079
3080 from_unsigned1 = from_unsigned2 = false;
3081 }
3082 else
3083 return false;
3084 }
3085
3086 /* Ensure that the inputs to the handler are in the correct precision
3087 for the opcode. This will be the full mode size. */
3088 actual_precision = GET_MODE_PRECISION (actual_mode);
3089 if (2 * actual_precision > TYPE_PRECISION (type))
3090 return false;
3091 if (actual_precision != TYPE_PRECISION (type1)
3092 || from_unsigned1 != TYPE_UNSIGNED (type1))
3093 rhs1 = build_and_insert_cast (gsi, loc,
3094 build_nonstandard_integer_type
3095 (actual_precision, from_unsigned1), rhs1);
3096 if (actual_precision != TYPE_PRECISION (type2)
3097 || from_unsigned2 != TYPE_UNSIGNED (type2))
3098 rhs2 = build_and_insert_cast (gsi, loc,
3099 build_nonstandard_integer_type
3100 (actual_precision, from_unsigned2), rhs2);
3101
3102 /* Handle constants. */
3103 if (TREE_CODE (rhs1) == INTEGER_CST)
3104 rhs1 = fold_convert (type1, rhs1);
3105 if (TREE_CODE (rhs2) == INTEGER_CST)
3106 rhs2 = fold_convert (type2, rhs2);
3107
3108 gimple_assign_set_rhs1 (stmt, rhs1);
3109 gimple_assign_set_rhs2 (stmt, rhs2);
3110 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3111 update_stmt (stmt);
3112 widen_mul_stats.widen_mults_inserted++;
3113 return true;
3114 }
3115
3116 /* Process a single gimple statement STMT, which is found at the
3117 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3118 rhs (given by CODE), and try to convert it into a
3119 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3120 is true iff we converted the statement. */
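/* For example, "acc + (long long) a * (long long) b" with 32-bit operands
   becomes WIDEN_MULT_PLUS_EXPR <a, b, acc> when the target provides a
   suitable widening multiply-accumulate pattern; MINUS_EXPR is handled
   analogously with WIDEN_MULT_MINUS_EXPR.  */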
3121
3122 static bool
3123 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
3124 enum tree_code code)
3125 {
3126 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
3127 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
3128 tree type, type1, type2, optype;
3129 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3130 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3131 optab this_optab;
3132 enum tree_code wmult_code;
3133 enum insn_code handler;
3134 machine_mode to_mode, from_mode, actual_mode;
3135 location_t loc = gimple_location (stmt);
3136 int actual_precision;
3137 bool from_unsigned1, from_unsigned2;
3138
3139 lhs = gimple_assign_lhs (stmt);
3140 type = TREE_TYPE (lhs);
3141 if (TREE_CODE (type) != INTEGER_TYPE
3142 && TREE_CODE (type) != FIXED_POINT_TYPE)
3143 return false;
3144
3145 if (code == MINUS_EXPR)
3146 wmult_code = WIDEN_MULT_MINUS_EXPR;
3147 else
3148 wmult_code = WIDEN_MULT_PLUS_EXPR;
3149
3150 rhs1 = gimple_assign_rhs1 (stmt);
3151 rhs2 = gimple_assign_rhs2 (stmt);
3152
3153 if (TREE_CODE (rhs1) == SSA_NAME)
3154 {
3155 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3156 if (is_gimple_assign (rhs1_stmt))
3157 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3158 }
3159
3160 if (TREE_CODE (rhs2) == SSA_NAME)
3161 {
3162 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3163 if (is_gimple_assign (rhs2_stmt))
3164 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3165 }
3166
3167 /* Allow for one conversion statement between the multiply
3168 and addition/subtraction statement. If there is more than
3169 one conversion then we assume they would invalidate this
3170 transformation. If that's not the case then they should have
3171 been folded before now. */
3172 if (CONVERT_EXPR_CODE_P (rhs1_code))
3173 {
3174 conv1_stmt = rhs1_stmt;
3175 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3176 if (TREE_CODE (rhs1) == SSA_NAME)
3177 {
3178 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3179 if (is_gimple_assign (rhs1_stmt))
3180 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3181 }
3182 else
3183 return false;
3184 }
3185 if (CONVERT_EXPR_CODE_P (rhs2_code))
3186 {
3187 conv2_stmt = rhs2_stmt;
3188 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3189 if (TREE_CODE (rhs2) == SSA_NAME)
3190 {
3191 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3192 if (is_gimple_assign (rhs2_stmt))
3193 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3194 }
3195 else
3196 return false;
3197 }
3198
3199 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3200 is_widening_mult_p, but we still need the rhs values it returns.
3201
3202 It might also appear that it would be sufficient to use the existing
3203 operands of the widening multiply, but that would limit the choice of
3204 multiply-and-accumulate instructions.
3205
3206 If the widened-multiplication result has more than one use, it is
3207 probably wiser not to do the conversion. */
3208 if (code == PLUS_EXPR
3209 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3210 {
3211 if (!has_single_use (rhs1)
3212 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3213 &type2, &mult_rhs2))
3214 return false;
3215 add_rhs = rhs2;
3216 conv_stmt = conv1_stmt;
3217 }
3218 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3219 {
3220 if (!has_single_use (rhs2)
3221 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3222 &type2, &mult_rhs2))
3223 return false;
3224 add_rhs = rhs1;
3225 conv_stmt = conv2_stmt;
3226 }
3227 else
3228 return false;
3229
3230 to_mode = TYPE_MODE (type);
3231 from_mode = TYPE_MODE (type1);
3232 from_unsigned1 = TYPE_UNSIGNED (type1);
3233 from_unsigned2 = TYPE_UNSIGNED (type2);
3234 optype = type1;
3235
3236 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3237 if (from_unsigned1 != from_unsigned2)
3238 {
3239 if (!INTEGRAL_TYPE_P (type))
3240 return false;
3241 /* We can use a signed multiply with unsigned types as long as
3242 there is a wider mode to use, or it is the smaller of the two
3243 types that is unsigned. Note that type1 >= type2, always. */
3244 if ((from_unsigned1
3245 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3246 || (from_unsigned2
3247 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3248 {
3249 from_mode = GET_MODE_WIDER_MODE (from_mode);
3250 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3251 return false;
3252 }
3253
3254 from_unsigned1 = from_unsigned2 = false;
3255 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3256 false);
3257 }
3258
3259 /* If there was a conversion between the multiply and addition
3260 then we need to make sure it fits a multiply-and-accumulate.
3261 There should be a single mode change which does not change the
3262 value. */
3263 if (conv_stmt)
3264 {
3265 /* We use the original, unmodified data types for this. */
3266 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3267 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3268 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3269 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3270
3271 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3272 {
3273 /* Conversion is a truncate. */
3274 if (TYPE_PRECISION (to_type) < data_size)
3275 return false;
3276 }
3277 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3278 {
3279 /* Conversion is an extend. Check it's the right sort. */
3280 if (TYPE_UNSIGNED (from_type) != is_unsigned
3281 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3282 return false;
3283 }
3284 /* else convert is a no-op for our purposes. */
3285 }
3286
3287 /* Verify that the machine can perform a widening multiply
3288 accumulate in this mode/signedness combination, otherwise
3289 this transformation is likely to pessimize code. */
3290 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3291 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3292 from_mode, 0, &actual_mode);
3293
3294 if (handler == CODE_FOR_nothing)
3295 return false;
3296
3297 /* Ensure that the inputs to the handler are in the correct precision
3298 for the opcode. This will be the full mode size. */
3299 actual_precision = GET_MODE_PRECISION (actual_mode);
3300 if (actual_precision != TYPE_PRECISION (type1)
3301 || from_unsigned1 != TYPE_UNSIGNED (type1))
3302 mult_rhs1 = build_and_insert_cast (gsi, loc,
3303 build_nonstandard_integer_type
3304 (actual_precision, from_unsigned1),
3305 mult_rhs1);
3306 if (actual_precision != TYPE_PRECISION (type2)
3307 || from_unsigned2 != TYPE_UNSIGNED (type2))
3308 mult_rhs2 = build_and_insert_cast (gsi, loc,
3309 build_nonstandard_integer_type
3310 (actual_precision, from_unsigned2),
3311 mult_rhs2);
3312
3313 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3314 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3315
3316 /* Handle constants. */
3317 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3318 mult_rhs1 = fold_convert (type1, mult_rhs1);
3319 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3320 mult_rhs2 = fold_convert (type2, mult_rhs2);
3321
3322 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3323 add_rhs);
3324 update_stmt (gsi_stmt (*gsi));
3325 widen_mul_stats.maccs_inserted++;
3326 return true;
3327 }
3328
3329 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3330 with uses in additions and subtractions to form fused multiply-add
3331 operations. Returns true if successful and MUL_STMT should be removed. */
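/* For example, given

     tmp = a * b;
     x = tmp + c;
     y = d - tmp;

   both uses can be rewritten, producing x = FMA <a, b, c> and, via a
   separately inserted negation of A, y = FMA <-a, b, d>, after which the
   multiplication is dead and can be removed by the caller.  */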
3332
3333 static bool
3334 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3335 {
3336 tree mul_result = gimple_get_lhs (mul_stmt);
3337 tree type = TREE_TYPE (mul_result);
3338 gimple use_stmt, neguse_stmt;
3339 gassign *fma_stmt;
3340 use_operand_p use_p;
3341 imm_use_iterator imm_iter;
3342
3343 if (FLOAT_TYPE_P (type)
3344 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3345 return false;
3346
3347 /* We don't want to do bitfield reduction ops. */
3348 if (INTEGRAL_TYPE_P (type)
3349 && (TYPE_PRECISION (type)
3350 != GET_MODE_PRECISION (TYPE_MODE (type))))
3351 return false;
3352
3353 /* If the target doesn't support it, don't generate it. We assume that
3354 if fma isn't available then fms, fnma or fnms are not either. */
3355 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3356 return false;
3357
3358 /* If the multiplication has zero uses, it is kept around probably because
3359 of -fnon-call-exceptions. Don't optimize it away in that case,
3360 it is DCE job. */
3361 if (has_zero_uses (mul_result))
3362 return false;
3363
3364 /* Make sure that the multiplication statement becomes dead after
3365 the transformation, i.e. that all uses are transformed to FMAs.
3366 This means we assume that an FMA operation has the same cost
3367 as an addition. */
3368 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3369 {
3370 enum tree_code use_code;
3371 tree result = mul_result;
3372 bool negate_p = false;
3373
3374 use_stmt = USE_STMT (use_p);
3375
3376 if (is_gimple_debug (use_stmt))
3377 continue;
3378
3379 /* For now restrict this operation to single basic blocks. In theory
3380 we would want to support sinking the multiplication in
3381 m = a*b;
3382 if ()
3383 ma = m + c;
3384 else
3385 d = m;
3386 to form a fma in the then block and sink the multiplication to the
3387 else block. */
3388 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3389 return false;
3390
3391 if (!is_gimple_assign (use_stmt))
3392 return false;
3393
3394 use_code = gimple_assign_rhs_code (use_stmt);
3395
3396 /* A negate on the multiplication leads to FNMA. */
3397 if (use_code == NEGATE_EXPR)
3398 {
3399 ssa_op_iter iter;
3400 use_operand_p usep;
3401
3402 result = gimple_assign_lhs (use_stmt);
3403
3404 /* Make sure the negate statement becomes dead with this
3405 single transformation. */
3406 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3407 &use_p, &neguse_stmt))
3408 return false;
3409
3410 /* Make sure the multiplication isn't also used on that stmt. */
3411 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3412 if (USE_FROM_PTR (usep) == mul_result)
3413 return false;
3414
3415 /* Re-validate. */
3416 use_stmt = neguse_stmt;
3417 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3418 return false;
3419 if (!is_gimple_assign (use_stmt))
3420 return false;
3421
3422 use_code = gimple_assign_rhs_code (use_stmt);
3423 negate_p = true;
3424 }
3425
3426 switch (use_code)
3427 {
3428 case MINUS_EXPR:
3429 if (gimple_assign_rhs2 (use_stmt) == result)
3430 negate_p = !negate_p;
3431 break;
3432 case PLUS_EXPR:
3433 break;
3434 default:
3435 /* FMA can only be formed from PLUS and MINUS. */
3436 return false;
3437 }
3438
3439 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3440 by a MULT_EXPR that we'll visit later, we might be able to
3441 get a more profitable match with fnma.
3442 OTOH, if we don't, a negate / fma pair has likely lower latency
3443 than a mult / subtract pair. */
3444 if (use_code == MINUS_EXPR && !negate_p
3445 && gimple_assign_rhs1 (use_stmt) == result
3446 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3447 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3448 {
3449 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3450
3451 if (TREE_CODE (rhs2) == SSA_NAME)
3452 {
3453 gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3454 if (has_single_use (rhs2)
3455 && is_gimple_assign (stmt2)
3456 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3457 return false;
3458 }
3459 }
3460
3461 /* We can't handle a * b + a * b. */
3462 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3463 return false;
3464
3465 /* While it is possible to validate whether or not the exact form
3466 that we've recognized is available in the backend, the assumption
3467 is that the transformation is never a loss. For instance, suppose
3468 the target only has the plain FMA pattern available. Consider
3469 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3470 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3471 still have 3 operations, but in the FMA form the two NEGs are
3472 independent and could be run in parallel. */
3473 }
3474
3475 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3476 {
3477 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3478 enum tree_code use_code;
3479 tree addop, mulop1 = op1, result = mul_result;
3480 bool negate_p = false;
3481
3482 if (is_gimple_debug (use_stmt))
3483 continue;
3484
3485 use_code = gimple_assign_rhs_code (use_stmt);
3486 if (use_code == NEGATE_EXPR)
3487 {
3488 result = gimple_assign_lhs (use_stmt);
3489 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3490 gsi_remove (&gsi, true);
3491 release_defs (use_stmt);
3492
3493 use_stmt = neguse_stmt;
3494 gsi = gsi_for_stmt (use_stmt);
3495 use_code = gimple_assign_rhs_code (use_stmt);
3496 negate_p = true;
3497 }
3498
3499 if (gimple_assign_rhs1 (use_stmt) == result)
3500 {
3501 addop = gimple_assign_rhs2 (use_stmt);
3502 /* a * b - c -> a * b + (-c) */
3503 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3504 addop = force_gimple_operand_gsi (&gsi,
3505 build1 (NEGATE_EXPR,
3506 type, addop),
3507 true, NULL_TREE, true,
3508 GSI_SAME_STMT);
3509 }
3510 else
3511 {
3512 addop = gimple_assign_rhs1 (use_stmt);
3513 /* a - b * c -> (-b) * c + a */
3514 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3515 negate_p = !negate_p;
3516 }
3517
3518 if (negate_p)
3519 mulop1 = force_gimple_operand_gsi (&gsi,
3520 build1 (NEGATE_EXPR,
3521 type, mulop1),
3522 true, NULL_TREE, true,
3523 GSI_SAME_STMT);
3524
3525 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3526 FMA_EXPR, mulop1, op2, addop);
3527 gsi_replace (&gsi, fma_stmt, true);
3528 widen_mul_stats.fmas_inserted++;
3529 }
3530
3531 return true;
3532 }
3533
3534 /* Find integer multiplications where the operands are extended from
3535 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3536 where appropriate. */
3537
3538 namespace {
3539
3540 const pass_data pass_data_optimize_widening_mul =
3541 {
3542 GIMPLE_PASS, /* type */
3543 "widening_mul", /* name */
3544 OPTGROUP_NONE, /* optinfo_flags */
3545 TV_NONE, /* tv_id */
3546 PROP_ssa, /* properties_required */
3547 0, /* properties_provided */
3548 0, /* properties_destroyed */
3549 0, /* todo_flags_start */
3550 TODO_update_ssa, /* todo_flags_finish */
3551 };
3552
3553 class pass_optimize_widening_mul : public gimple_opt_pass
3554 {
3555 public:
3556 pass_optimize_widening_mul (gcc::context *ctxt)
3557 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3558 {}
3559
3560 /* opt_pass methods: */
3561 virtual bool gate (function *)
3562 {
3563 return flag_expensive_optimizations && optimize;
3564 }
3565
3566 virtual unsigned int execute (function *);
3567
3568 }; // class pass_optimize_widening_mul
3569
3570 unsigned int
3571 pass_optimize_widening_mul::execute (function *fun)
3572 {
3573 basic_block bb;
3574 bool cfg_changed = false;
3575
3576 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3577
3578 FOR_EACH_BB_FN (bb, fun)
3579 {
3580 gimple_stmt_iterator gsi;
3581
3582 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3583 {
3584 gimple stmt = gsi_stmt (gsi);
3585 enum tree_code code;
3586
3587 if (is_gimple_assign (stmt))
3588 {
3589 code = gimple_assign_rhs_code (stmt);
3590 switch (code)
3591 {
3592 case MULT_EXPR:
3593 if (!convert_mult_to_widen (stmt, &gsi)
3594 && convert_mult_to_fma (stmt,
3595 gimple_assign_rhs1 (stmt),
3596 gimple_assign_rhs2 (stmt)))
3597 {
3598 gsi_remove (&gsi, true);
3599 release_defs (stmt);
3600 continue;
3601 }
3602 break;
3603
3604 case PLUS_EXPR:
3605 case MINUS_EXPR:
3606 convert_plusminus_to_widen (&gsi, stmt, code);
3607 break;
3608
3609 default:;
3610 }
3611 }
3612 else if (is_gimple_call (stmt)
3613 && gimple_call_lhs (stmt))
3614 {
3615 tree fndecl = gimple_call_fndecl (stmt);
3616 if (fndecl
3617 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3618 {
3619 switch (DECL_FUNCTION_CODE (fndecl))
3620 {
3621 case BUILT_IN_POWF:
3622 case BUILT_IN_POW:
3623 case BUILT_IN_POWL:
3624 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3625 && REAL_VALUES_EQUAL
3626 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3627 dconst2)
3628 && convert_mult_to_fma (stmt,
3629 gimple_call_arg (stmt, 0),
3630 gimple_call_arg (stmt, 0)))
3631 {
3632 unlink_stmt_vdef (stmt);
3633 if (gsi_remove (&gsi, true)
3634 && gimple_purge_dead_eh_edges (bb))
3635 cfg_changed = true;
3636 release_defs (stmt);
3637 continue;
3638 }
3639 break;
3640
3641 default:;
3642 }
3643 }
3644 }
3645 gsi_next (&gsi);
3646 }
3647 }
3648
3649 statistics_counter_event (fun, "widening multiplications inserted",
3650 widen_mul_stats.widen_mults_inserted);
3651 statistics_counter_event (fun, "widening maccs inserted",
3652 widen_mul_stats.maccs_inserted);
3653 statistics_counter_event (fun, "fused multiply-adds inserted",
3654 widen_mul_stats.fmas_inserted);
3655
3656 return cfg_changed ? TODO_cleanup_cfg : 0;
3657 }
3658
3659 } // anon namespace
3660
3661 gimple_opt_pass *
3662 make_pass_optimize_widening_mul (gcc::context *ctxt)
3663 {
3664 return new pass_optimize_widening_mul (ctxt);
3665 }