gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2020 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
43      First of all, experiments have shown that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59      that has fewer than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63      and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
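/* As a rough illustration (assuming flag_reciprocal_math and a target
   threshold of two divisions, the usual default of
   min_divisions_for_recip_mul), a source sequence such as

	x = a / d;
	y = b / d;

   is rewritten by this pass as if it read

	t = 1.0 / d;
	x = a * t;
	y = b * t;

   while a single division by D is left alone.  */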
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "internal-fn.h"
113 #include "case-cfn-macros.h"
114 #include "optabs-libfuncs.h"
115 #include "tree-eh.h"
116 #include "targhooks.h"
117 #include "domwalk.h"
118
119 /* This structure represents one basic block that either computes a
120    division, or is a common dominator for basic blocks that compute a
121 division. */
122 struct occurrence {
123 /* The basic block represented by this structure. */
124 basic_block bb = basic_block();
125
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */
128 tree recip_def = tree();
129
130 /* If non-NULL, the SSA_NAME holding the definition for a squared
131 reciprocal inserted in BB. */
132 tree square_recip_def = tree();
133
134 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
135 was inserted in BB. */
136 gimple *recip_def_stmt = nullptr;
137
138 /* Pointer to a list of "struct occurrence"s for blocks dominated
139 by BB. */
140 struct occurrence *children = nullptr;
141
142  /* Pointer to the next "struct occurrence" in the list of blocks
143 sharing a common dominator. */
144 struct occurrence *next = nullptr;
145
146 /* The number of divisions that are in BB before compute_merit. The
147 number of divisions that are in BB or post-dominate it after
148 compute_merit. */
149 int num_divisions = 0;
150
151 /* True if the basic block has a division, false if it is a common
152 dominator for basic blocks that do. If it is false and trapping
153 math is active, BB is not a candidate for inserting a reciprocal. */
154 bool bb_has_division = false;
155
156 /* Construct a struct occurrence for basic block BB, and whose
157 children list is headed by CHILDREN. */
158 occurrence (basic_block bb, struct occurrence *children)
159 : bb (bb), children (children)
160 {
161 bb->aux = this;
162 }
163
164 /* Destroy a struct occurrence and remove it from its basic block. */
165 ~occurrence ()
166 {
167 bb->aux = nullptr;
168 }
169
170 /* Allocate memory for a struct occurrence from OCC_POOL. */
171 static void* operator new (size_t);
172
173 /* Return memory for a struct occurrence to OCC_POOL. */
174 static void operator delete (void*, size_t);
175 };
176
177 static struct
178 {
179 /* Number of 1.0/X ops inserted. */
180 int rdivs_inserted;
181
182 /* Number of 1.0/FUNC ops inserted. */
183 int rfuncs_inserted;
184 } reciprocal_stats;
185
186 static struct
187 {
188 /* Number of cexpi calls inserted. */
189 int inserted;
190 } sincos_stats;
191
192 static struct
193 {
194 /* Number of widening multiplication ops inserted. */
195 int widen_mults_inserted;
196
197 /* Number of integer multiply-and-accumulate ops inserted. */
198 int maccs_inserted;
199
200 /* Number of fp fused multiply-add ops inserted. */
201 int fmas_inserted;
202
203 /* Number of divmod calls inserted. */
204 int divmod_calls_inserted;
205 } widen_mul_stats;
206
207 /* The instance of "struct occurrence" representing the highest
208 interesting block in the dominator tree. */
209 static struct occurrence *occ_head;
210
211 /* Allocation pool for getting instances of "struct occurrence". */
212 static object_allocator<occurrence> *occ_pool;
213
214 void* occurrence::operator new (size_t n)
215 {
216 gcc_assert (n == sizeof(occurrence));
217 return occ_pool->allocate_raw ();
218 }
219
220 void occurrence::operator delete (void *occ, size_t n)
221 {
222 gcc_assert (n == sizeof(occurrence));
223 occ_pool->remove_raw (occ);
224 }
225
226 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
227 list of "struct occurrence"s, one per basic block, having IDOM as
228 their common dominator.
229
230 We try to insert NEW_OCC as deep as possible in the tree, and we also
231 insert any other block that is a common dominator for BB and one
232 block already in the tree. */
233
234 static void
235 insert_bb (struct occurrence *new_occ, basic_block idom,
236 struct occurrence **p_head)
237 {
238 struct occurrence *occ, **p_occ;
239
240 for (p_occ = p_head; (occ = *p_occ) != NULL; )
241 {
242 basic_block bb = new_occ->bb, occ_bb = occ->bb;
243 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
244 if (dom == bb)
245 {
246 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
247 from its list. */
248 *p_occ = occ->next;
249 occ->next = new_occ->children;
250 new_occ->children = occ;
251
252	  /* Try the next block (it may also be dominated by BB).  */
253 }
254
255 else if (dom == occ_bb)
256 {
257 /* OCC_BB dominates BB. Tail recurse to look deeper. */
258 insert_bb (new_occ, dom, &occ->children);
259 return;
260 }
261
262 else if (dom != idom)
263 {
264 gcc_assert (!dom->aux);
265
266 /* There is a dominator between IDOM and BB, add it and make
267 two children out of NEW_OCC and OCC. First, remove OCC from
268 its list. */
269 *p_occ = occ->next;
270 new_occ->next = occ;
271 occ->next = NULL;
272
273 /* None of the previous blocks has DOM as a dominator: if we tail
274 recursed, we would reexamine them uselessly. Just switch BB with
275 DOM, and go on looking for blocks dominated by DOM. */
276 new_occ = new occurrence (dom, new_occ);
277 }
278
279 else
280 {
281 /* Nothing special, go on with the next element. */
282 p_occ = &occ->next;
283 }
284 }
285
286 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
287 new_occ->next = *p_head;
288 *p_head = new_occ;
289 }
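/* A small illustration of the cases above, using hypothetical blocks
   BB2, BB4 and BB5: suppose the list under the entry block already
   holds an occurrence for BB5 and we insert a new occurrence for BB4,
   where the nearest common dominator of BB4 and BB5 is BB2.  The third
   case applies: BB5 is unlinked from the list, a fresh occurrence is
   created for BB2 with BB4 and BB5 as its children, and BB2 itself is
   finally linked into the list under the entry block.  */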
290
291 /* Register that we found a division in BB.
292 IMPORTANCE is a measure of how much weighting to give
293 that division. Use IMPORTANCE = 2 to register a single
294 division. If the division is going to be found multiple
295 times use 1 (as it is with squares). */
296
297 static inline void
298 register_division_in (basic_block bb, int importance)
299 {
300 struct occurrence *occ;
301
302 occ = (struct occurrence *) bb->aux;
303 if (!occ)
304 {
305 occ = new occurrence (bb, NULL);
306 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
307 }
308
309 occ->bb_has_division = true;
310 occ->num_divisions += importance;
311 }
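/* For illustration: a plain division x = a / d is registered once with
   IMPORTANCE == 2, while a division by a square, y = b / (d * d), is
   reached twice by the use walk in execute_cse_reciprocals_1 and so is
   registered twice with IMPORTANCE == 1.  Either way each division
   contributes 2 to num_divisions, which is why insert_reciprocals
   halves the count before comparing it against the threshold.  */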
312
313
314 /* Compute the number of divisions that postdominate each block in OCC and
315 its children. */
316
317 static void
318 compute_merit (struct occurrence *occ)
319 {
320 struct occurrence *occ_child;
321 basic_block dom = occ->bb;
322
323 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
324 {
325 basic_block bb;
326 if (occ_child->children)
327 compute_merit (occ_child);
328
329 if (flag_exceptions)
330 bb = single_noncomplex_succ (dom);
331 else
332 bb = dom;
333
334 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
335 occ->num_divisions += occ_child->num_divisions;
336 }
337 }
338
339
340 /* Return whether USE_STMT is a floating-point division by DEF. */
341 static inline bool
342 is_division_by (gimple *use_stmt, tree def)
343 {
344 return is_gimple_assign (use_stmt)
345 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
346 && gimple_assign_rhs2 (use_stmt) == def
347 /* Do not recognize x / x as valid division, as we are getting
348 confused later by replacing all immediate uses x in such
349 a stmt. */
350 && gimple_assign_rhs1 (use_stmt) != def
351 && !stmt_can_throw_internal (cfun, use_stmt);
352 }
353
354 /* Return TRUE if USE_STMT is a multiplication of DEF by A. */
355 static inline bool
356 is_mult_by (gimple *use_stmt, tree def, tree a)
357 {
358 if (gimple_code (use_stmt) == GIMPLE_ASSIGN
359 && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
360 {
361 tree op0 = gimple_assign_rhs1 (use_stmt);
362 tree op1 = gimple_assign_rhs2 (use_stmt);
363
364 return (op0 == def && op1 == a)
365 || (op0 == a && op1 == def);
366 }
367   return false;
368 }
369
370 /* Return whether USE_STMT is DEF * DEF. */
371 static inline bool
372 is_square_of (gimple *use_stmt, tree def)
373 {
374 return is_mult_by (use_stmt, def, def);
375 }
376
377 /* Return whether USE_STMT is a floating-point division by
378 DEF * DEF. */
379 static inline bool
380 is_division_by_square (gimple *use_stmt, tree def)
381 {
382 if (gimple_code (use_stmt) == GIMPLE_ASSIGN
383 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
384 && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
385 && !stmt_can_throw_internal (cfun, use_stmt))
386 {
387 tree denominator = gimple_assign_rhs2 (use_stmt);
388 if (TREE_CODE (denominator) == SSA_NAME)
389 return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
390 }
391   return false;
392 }
393
394 /* Walk the subset of the dominator tree rooted at OCC, setting the
395 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
396 the given basic block. The field may be left NULL, of course,
397 if it is not possible or profitable to do the optimization.
398
399 DEF_BSI is an iterator pointing at the statement defining DEF.
400 If RECIP_DEF is set, a dominator already has a computation that can
401 be used.
402
403 If should_insert_square_recip is set, then this also inserts
404 the square of the reciprocal immediately after the definition
405 of the reciprocal. */
406
407 static void
408 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
409 tree def, tree recip_def, tree square_recip_def,
410 int should_insert_square_recip, int threshold)
411 {
412 tree type;
413 gassign *new_stmt, *new_square_stmt;
414 gimple_stmt_iterator gsi;
415 struct occurrence *occ_child;
416
417 if (!recip_def
418 && (occ->bb_has_division || !flag_trapping_math)
419 /* Divide by two as all divisions are counted twice in
420 the costing loop. */
421 && occ->num_divisions / 2 >= threshold)
422 {
423 /* Make a variable with the replacement and substitute it. */
424 type = TREE_TYPE (def);
425 recip_def = create_tmp_reg (type, "reciptmp");
426 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
427 build_one_cst (type), def);
428
429 if (should_insert_square_recip)
430 {
431 square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
432 new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
433 recip_def, recip_def);
434 }
435
436 if (occ->bb_has_division)
437 {
438 /* Case 1: insert before an existing division. */
439 gsi = gsi_after_labels (occ->bb);
440 while (!gsi_end_p (gsi)
441 && (!is_division_by (gsi_stmt (gsi), def))
442 && (!is_division_by_square (gsi_stmt (gsi), def)))
443 gsi_next (&gsi);
444
445 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
446 if (should_insert_square_recip)
447 gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
448 }
449 else if (def_gsi && occ->bb == def_gsi->bb)
450 {
451 /* Case 2: insert right after the definition. Note that this will
452 never happen if the definition statement can throw, because in
453 that case the sole successor of the statement's basic block will
454 dominate all the uses as well. */
455 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
456 if (should_insert_square_recip)
457 gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
458 }
459 else
460 {
461 /* Case 3: insert in a basic block not containing defs/uses. */
462 gsi = gsi_after_labels (occ->bb);
463 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
464 if (should_insert_square_recip)
465 gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
466 }
467
468 reciprocal_stats.rdivs_inserted++;
469
470 occ->recip_def_stmt = new_stmt;
471 }
472
473 occ->recip_def = recip_def;
474 occ->square_recip_def = square_recip_def;
475 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
476 insert_reciprocals (def_gsi, occ_child, def, recip_def,
477 square_recip_def, should_insert_square_recip,
478 threshold);
479 }
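/* A schematic sketch (made-up SSA names) of the net effect of this
   routine together with the replace_* helpers below, when
   SHOULD_INSERT_SQUARE_RECIP is set: a block containing

	x_3 = a_1 / d_2;
	t_4 = d_2 * d_2;
	y_5 = b_6 / t_4;

   roughly becomes

	reciptmp_7 = 1.0 / d_2;
	powmult_reciptmp_8 = reciptmp_7 * reciptmp_7;
	x_3 = a_1 * reciptmp_7;
	t_4 = d_2 * d_2;
	y_5 = b_6 * powmult_reciptmp_8;

   leaving the now possibly dead square for later DCE.  */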
480
481 /* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
482 Take as argument the use for (x * x). */
483 static inline void
484 replace_reciprocal_squares (use_operand_p use_p)
485 {
486 gimple *use_stmt = USE_STMT (use_p);
487 basic_block bb = gimple_bb (use_stmt);
488 struct occurrence *occ = (struct occurrence *) bb->aux;
489
490 if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
491 && occ->recip_def)
492 {
493 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
494 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
495 gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
496 SET_USE (use_p, occ->square_recip_def);
497 fold_stmt_inplace (&gsi);
498 update_stmt (use_stmt);
499 }
500 }
501
502
503 /* Replace the division at USE_P with a multiplication by the reciprocal, if
504 possible. */
505
506 static inline void
507 replace_reciprocal (use_operand_p use_p)
508 {
509 gimple *use_stmt = USE_STMT (use_p);
510 basic_block bb = gimple_bb (use_stmt);
511 struct occurrence *occ = (struct occurrence *) bb->aux;
512
513 if (optimize_bb_for_speed_p (bb)
514 && occ->recip_def && use_stmt != occ->recip_def_stmt)
515 {
516 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
517 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
518 SET_USE (use_p, occ->recip_def);
519 fold_stmt_inplace (&gsi);
520 update_stmt (use_stmt);
521 }
522 }
523
524
525 /* Free OCC and return one more "struct occurrence" to be freed. */
526
527 static struct occurrence *
528 free_bb (struct occurrence *occ)
529 {
530 struct occurrence *child, *next;
531
532 /* First get the two pointers hanging off OCC. */
533 next = occ->next;
534 child = occ->children;
535 delete occ;
536
537 /* Now ensure that we don't recurse unless it is necessary. */
538 if (!child)
539 return next;
540 else
541 {
542 while (next)
543 next = free_bb (next);
544
545 return child;
546 }
547 }
548
549 /* Transform sequences like
550 t = sqrt (a)
551 x = 1.0 / t;
552 r1 = x * x;
553 r2 = a * x;
554 into:
555 t = sqrt (a)
556 r1 = 1.0 / a;
557 r2 = t;
558 x = r1 * r2;
559 depending on the uses of x, r1, r2. This removes one multiplication and
560 allows the sqrt and division operations to execute in parallel.
561 DEF_GSI is the gsi of the initial division by sqrt that defines
562 DEF (x in the example above). */
563
564 static void
565 optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
566 {
567 gimple *use_stmt;
568 imm_use_iterator use_iter;
569 gimple *stmt = gsi_stmt (*def_gsi);
570 tree x = def;
571 tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
572 tree div_rhs1 = gimple_assign_rhs1 (stmt);
573
574 if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
575 || TREE_CODE (div_rhs1) != REAL_CST
576 || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
577 return;
578
579 gcall *sqrt_stmt
580 = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
581
582 if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
583 return;
584
585 switch (gimple_call_combined_fn (sqrt_stmt))
586 {
587 CASE_CFN_SQRT:
588 CASE_CFN_SQRT_FN:
589 break;
590
591 default:
592 return;
593 }
594 tree a = gimple_call_arg (sqrt_stmt, 0);
595
596 /* We have 'a' and 'x'. Now analyze the uses of 'x'. */
597
598 /* Statements that use x in x * x. */
599 auto_vec<gimple *> sqr_stmts;
600 /* Statements that use x in a * x. */
601 auto_vec<gimple *> mult_stmts;
602 bool has_other_use = false;
603 bool mult_on_main_path = false;
604
605 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
606 {
607 if (is_gimple_debug (use_stmt))
608 continue;
609 if (is_square_of (use_stmt, x))
610 {
611 sqr_stmts.safe_push (use_stmt);
612 if (gimple_bb (use_stmt) == gimple_bb (stmt))
613 mult_on_main_path = true;
614 }
615 else if (is_mult_by (use_stmt, x, a))
616 {
617 mult_stmts.safe_push (use_stmt);
618 if (gimple_bb (use_stmt) == gimple_bb (stmt))
619 mult_on_main_path = true;
620 }
621 else
622 has_other_use = true;
623 }
624
625 /* In the x * x and a * x cases we just rewire stmt operands or
626 remove multiplications. In the has_other_use case we introduce
627 a multiplication so make sure we don't introduce a multiplication
628 on a path where there was none. */
629 if (has_other_use && !mult_on_main_path)
630 return;
631
632 if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
633 return;
634
635 /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
636 to be able to compose it from the sqr and mult cases. */
637 if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
638 return;
639
640 if (dump_file)
641 {
642 fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
643 print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
644 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
645 fprintf (dump_file, "\n");
646 }
647
648 bool delete_div = !has_other_use;
649 tree sqr_ssa_name = NULL_TREE;
650 if (!sqr_stmts.is_empty ())
651 {
652 /* r1 = x * x. Transform the original
653 x = 1.0 / t
654 into
655 tmp1 = 1.0 / a
656 r1 = tmp1. */
657
658 sqr_ssa_name
659 = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
660
661 if (dump_file)
662 {
663 fprintf (dump_file, "Replacing original division\n");
664 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
665 fprintf (dump_file, "with new division\n");
666 }
667 stmt
668 = gimple_build_assign (sqr_ssa_name, gimple_assign_rhs_code (stmt),
669 gimple_assign_rhs1 (stmt), a);
670 gsi_insert_before (def_gsi, stmt, GSI_SAME_STMT);
671 gsi_remove (def_gsi, true);
672 *def_gsi = gsi_for_stmt (stmt);
673 fold_stmt_inplace (def_gsi);
674 update_stmt (stmt);
675
676 if (dump_file)
677 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
678
679 delete_div = false;
680 gimple *sqr_stmt;
681 unsigned int i;
682 FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
683 {
684 gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
685 gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
686 update_stmt (sqr_stmt);
687 }
688 }
689 if (!mult_stmts.is_empty ())
690 {
691 /* r2 = a * x. Transform this into:
692 r2 = t (The original sqrt (a)). */
693 unsigned int i;
694 gimple *mult_stmt = NULL;
695 FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
696 {
697 gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
698
699 if (dump_file)
700 {
701 fprintf (dump_file, "Replacing squaring multiplication\n");
702 print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
703 fprintf (dump_file, "with assignment\n");
704 }
705 gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
706 fold_stmt_inplace (&gsi2);
707 update_stmt (mult_stmt);
708 if (dump_file)
709 print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
710 }
711 }
712
713 if (has_other_use)
714 {
715 /* Using the two temporaries tmp1, tmp2 from above
716 the original x is now:
717 x = tmp1 * tmp2. */
718 gcc_assert (orig_sqrt_ssa_name);
719 gcc_assert (sqr_ssa_name);
720
721 gimple *new_stmt
722 = gimple_build_assign (x, MULT_EXPR,
723 orig_sqrt_ssa_name, sqr_ssa_name);
724 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
725 update_stmt (stmt);
726 }
727 else if (delete_div)
728 {
729 /* Remove the original division. */
730 gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
731 gsi_remove (&gsi2, true);
732 release_defs (stmt);
733 }
734 else
735 release_ssa_name (x);
736 }
737
738 /* Look for floating-point divisions among DEF's uses, and try to
739 replace them by multiplications with the reciprocal. Add
740 as many statements computing the reciprocal as needed.
741
742 DEF must be a GIMPLE register of a floating-point type. */
743
744 static void
745 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
746 {
747 use_operand_p use_p, square_use_p;
748 imm_use_iterator use_iter, square_use_iter;
749 tree square_def;
750 struct occurrence *occ;
751 int count = 0;
752 int threshold;
753 int square_recip_count = 0;
754 int sqrt_recip_count = 0;
755
756 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
757 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
758
759 /* If DEF is a square (x * x), count the number of divisions by x.
760 If there are more divisions by x than by (DEF * DEF), prefer to optimize
761 the reciprocal of x instead of DEF. This improves cases like:
762 def = x * x
763 t0 = a / def
764 t1 = b / def
765 t2 = c / x
766 Reciprocal optimization of x results in 1 division rather than 2 or 3. */
767 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
768
769 if (is_gimple_assign (def_stmt)
770 && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
771 && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
772 && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
773 {
774 tree op0 = gimple_assign_rhs1 (def_stmt);
775
776 FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
777 {
778 gimple *use_stmt = USE_STMT (use_p);
779 if (is_division_by (use_stmt, op0))
780 sqrt_recip_count++;
781 }
782 }
783
784 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
785 {
786 gimple *use_stmt = USE_STMT (use_p);
787 if (is_division_by (use_stmt, def))
788 {
789 register_division_in (gimple_bb (use_stmt), 2);
790 count++;
791 }
792
793 if (is_square_of (use_stmt, def))
794 {
795 square_def = gimple_assign_lhs (use_stmt);
796 FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
797 {
798 gimple *square_use_stmt = USE_STMT (square_use_p);
799 if (is_division_by (square_use_stmt, square_def))
800 {
801 /* This is executed twice for each division by a square. */
802 register_division_in (gimple_bb (square_use_stmt), 1);
803 square_recip_count++;
804 }
805 }
806 }
807 }
808
809 /* Square reciprocals were counted twice above. */
810 square_recip_count /= 2;
811
812 /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x). */
813 if (sqrt_recip_count > square_recip_count)
814 goto out;
815
816 /* Do the expensive part only if we can hope to optimize something. */
817 if (count + square_recip_count >= threshold && count >= 1)
818 {
819 gimple *use_stmt;
820 for (occ = occ_head; occ; occ = occ->next)
821 {
822 compute_merit (occ);
823 insert_reciprocals (def_gsi, occ, def, NULL, NULL,
824 square_recip_count, threshold);
825 }
826
827 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
828 {
829 if (is_division_by (use_stmt, def))
830 {
831 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
832 replace_reciprocal (use_p);
833 }
834 else if (square_recip_count > 0 && is_square_of (use_stmt, def))
835 {
836 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
837 {
838 /* Find all uses of the square that are divisions and
839		     replace them by multiplications with the inverse.  */
840 imm_use_iterator square_iterator;
841 gimple *powmult_use_stmt = USE_STMT (use_p);
842 tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);
843
844 FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
845 square_iterator, powmult_def_name)
846 FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
847 {
848 gimple *powmult_use_stmt = USE_STMT (square_use_p);
849 if (is_division_by (powmult_use_stmt, powmult_def_name))
850 replace_reciprocal_squares (square_use_p);
851 }
852 }
853 }
854 }
855 }
856
857 out:
858 for (occ = occ_head; occ; )
859 occ = free_bb (occ);
860
861 occ_head = NULL;
862 }
863
864 /* Return an internal function that implements the reciprocal of CALL,
865 or IFN_LAST if there is no such function that the target supports. */
866
867 internal_fn
868 internal_fn_reciprocal (gcall *call)
869 {
870 internal_fn ifn;
871
872 switch (gimple_call_combined_fn (call))
873 {
874 CASE_CFN_SQRT:
875 CASE_CFN_SQRT_FN:
876 ifn = IFN_RSQRT;
877 break;
878
879 default:
880 return IFN_LAST;
881 }
882
883 tree_pair types = direct_internal_fn_types (ifn, call);
884 if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
885 return IFN_LAST;
886
887 return ifn;
888 }
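/* As a rough sketch (assuming the target supports IFN_RSQRT for the
   mode): when every use of a sqrt result is a division, the caller
   below turns

	t_1 = sqrtf (b_2);
	x_3 = a_4 / t_1;

   into, roughly,

	t_1 = .RSQRT (b_2);
	x_3 = a_4 * t_1;  */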
889
890 /* Go through all the floating-point SSA_NAMEs, and call
891 execute_cse_reciprocals_1 on each of them. */
892 namespace {
893
894 const pass_data pass_data_cse_reciprocals =
895 {
896 GIMPLE_PASS, /* type */
897 "recip", /* name */
898 OPTGROUP_NONE, /* optinfo_flags */
899 TV_TREE_RECIP, /* tv_id */
900 PROP_ssa, /* properties_required */
901 0, /* properties_provided */
902 0, /* properties_destroyed */
903 0, /* todo_flags_start */
904 TODO_update_ssa, /* todo_flags_finish */
905 };
906
907 class pass_cse_reciprocals : public gimple_opt_pass
908 {
909 public:
910 pass_cse_reciprocals (gcc::context *ctxt)
911 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
912 {}
913
914 /* opt_pass methods: */
915 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
916 virtual unsigned int execute (function *);
917
918 }; // class pass_cse_reciprocals
919
920 unsigned int
921 pass_cse_reciprocals::execute (function *fun)
922 {
923 basic_block bb;
924 tree arg;
925
926 occ_pool = new object_allocator<occurrence> ("dominators for recip");
927
928 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
929 calculate_dominance_info (CDI_DOMINATORS);
930 calculate_dominance_info (CDI_POST_DOMINATORS);
931
932 if (flag_checking)
933 FOR_EACH_BB_FN (bb, fun)
934 gcc_assert (!bb->aux);
935
936 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
937 if (FLOAT_TYPE_P (TREE_TYPE (arg))
938 && is_gimple_reg (arg))
939 {
940 tree name = ssa_default_def (fun, arg);
941 if (name)
942 execute_cse_reciprocals_1 (NULL, name);
943 }
944
945 FOR_EACH_BB_FN (bb, fun)
946 {
947 tree def;
948
949 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
950 gsi_next (&gsi))
951 {
952 gphi *phi = gsi.phi ();
953 def = PHI_RESULT (phi);
954 if (! virtual_operand_p (def)
955 && FLOAT_TYPE_P (TREE_TYPE (def)))
956 execute_cse_reciprocals_1 (NULL, def);
957 }
958
959 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
960 gsi_next (&gsi))
961 {
962 gimple *stmt = gsi_stmt (gsi);
963
964 if (gimple_has_lhs (stmt)
965 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
966 && FLOAT_TYPE_P (TREE_TYPE (def))
967 && TREE_CODE (def) == SSA_NAME)
968 {
969 execute_cse_reciprocals_1 (&gsi, def);
970 stmt = gsi_stmt (gsi);
971 if (flag_unsafe_math_optimizations
972 && is_gimple_assign (stmt)
973 && gimple_assign_lhs (stmt) == def
974 && !stmt_can_throw_internal (cfun, stmt)
975 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
976 optimize_recip_sqrt (&gsi, def);
977 }
978 }
979
980 if (optimize_bb_for_size_p (bb))
981 continue;
982
983 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
984 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
985 gsi_next (&gsi))
986 {
987 gimple *stmt = gsi_stmt (gsi);
988
989 if (is_gimple_assign (stmt)
990 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
991 {
992 tree arg1 = gimple_assign_rhs2 (stmt);
993 gimple *stmt1;
994
995 if (TREE_CODE (arg1) != SSA_NAME)
996 continue;
997
998 stmt1 = SSA_NAME_DEF_STMT (arg1);
999
1000 if (is_gimple_call (stmt1)
1001 && gimple_call_lhs (stmt1))
1002 {
1003 bool fail;
1004 imm_use_iterator ui;
1005 use_operand_p use_p;
1006 tree fndecl = NULL_TREE;
1007
1008 gcall *call = as_a <gcall *> (stmt1);
1009 internal_fn ifn = internal_fn_reciprocal (call);
1010 if (ifn == IFN_LAST)
1011 {
1012 fndecl = gimple_call_fndecl (call);
1013 if (!fndecl
1014 || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
1015 continue;
1016 fndecl = targetm.builtin_reciprocal (fndecl);
1017 if (!fndecl)
1018 continue;
1019 }
1020
1021 /* Check that all uses of the SSA name are divisions,
1022 otherwise replacing the defining statement will do
1023 the wrong thing. */
1024 fail = false;
1025 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
1026 {
1027 gimple *stmt2 = USE_STMT (use_p);
1028 if (is_gimple_debug (stmt2))
1029 continue;
1030 if (!is_gimple_assign (stmt2)
1031 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
1032 || gimple_assign_rhs1 (stmt2) == arg1
1033 || gimple_assign_rhs2 (stmt2) != arg1)
1034 {
1035 fail = true;
1036 break;
1037 }
1038 }
1039 if (fail)
1040 continue;
1041
1042 gimple_replace_ssa_lhs (call, arg1);
1043 if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
1044 {
1045 auto_vec<tree, 4> args;
1046 for (unsigned int i = 0;
1047 i < gimple_call_num_args (call); i++)
1048 args.safe_push (gimple_call_arg (call, i));
1049 gcall *stmt2;
1050 if (ifn == IFN_LAST)
1051 stmt2 = gimple_build_call_vec (fndecl, args);
1052 else
1053 stmt2 = gimple_build_call_internal_vec (ifn, args);
1054 gimple_call_set_lhs (stmt2, arg1);
1055 gimple_move_vops (stmt2, call);
1056 gimple_call_set_nothrow (stmt2,
1057 gimple_call_nothrow_p (call));
1058 gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
1059 gsi_replace (&gsi2, stmt2, true);
1060 }
1061 else
1062 {
1063 if (ifn == IFN_LAST)
1064 gimple_call_set_fndecl (call, fndecl);
1065 else
1066 gimple_call_set_internal_fn (call, ifn);
1067 update_stmt (call);
1068 }
1069 reciprocal_stats.rfuncs_inserted++;
1070
1071 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
1072 {
1073 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
1074 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
1075 fold_stmt_inplace (&gsi);
1076 update_stmt (stmt);
1077 }
1078 }
1079 }
1080 }
1081 }
1082
1083 statistics_counter_event (fun, "reciprocal divs inserted",
1084 reciprocal_stats.rdivs_inserted);
1085 statistics_counter_event (fun, "reciprocal functions inserted",
1086 reciprocal_stats.rfuncs_inserted);
1087
1088 free_dominance_info (CDI_DOMINATORS);
1089 free_dominance_info (CDI_POST_DOMINATORS);
1090 delete occ_pool;
1091 return 0;
1092 }
1093
1094 } // anon namespace
1095
1096 gimple_opt_pass *
1097 make_pass_cse_reciprocals (gcc::context *ctxt)
1098 {
1099 return new pass_cse_reciprocals (ctxt);
1100 }
1101
1102 /* Records an occurrence at statement USE_STMT in the vector of statements
1103    STMTS if it is dominated by *TOP_BB, dominates it, or *TOP_BB is not
1104    yet initialized.  Returns true if the occurrence was pushed on
1105 the vector. Adjusts *TOP_BB to be the basic block dominating all
1106 statements in the vector. */
1107
1108 static bool
1109 maybe_record_sincos (vec<gimple *> *stmts,
1110 basic_block *top_bb, gimple *use_stmt)
1111 {
1112 basic_block use_bb = gimple_bb (use_stmt);
1113 if (*top_bb
1114 && (*top_bb == use_bb
1115 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
1116 stmts->safe_push (use_stmt);
1117 else if (!*top_bb
1118 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
1119 {
1120 stmts->safe_push (use_stmt);
1121 *top_bb = use_bb;
1122 }
1123 else
1124 return false;
1125
1126 return true;
1127 }
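/* For illustration, with hypothetical blocks: if the first recorded
   use is in BB4, *TOP_BB becomes BB4; a later use in BB2, which
   dominates BB4, is also recorded and *TOP_BB moves up to BB2; a use
   in a block with no dominance relation to *TOP_BB is rejected and
   the function returns false.  */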
1128
1129 /* Look for sin, cos and cexpi calls with the same argument NAME and
1130 create a single call to cexpi CSEing the result in this case.
1131 We first walk over all immediate uses of the argument collecting
1132 statements that we can CSE in a vector and in a second pass replace
1133 the statement rhs with a REALPART or IMAGPART expression on the
1134 result of the cexpi call we insert before the use statement that
1135 dominates all other candidates. */
1136
1137 static bool
1138 execute_cse_sincos_1 (tree name)
1139 {
1140 gimple_stmt_iterator gsi;
1141 imm_use_iterator use_iter;
1142 tree fndecl, res, type;
1143 gimple *def_stmt, *use_stmt, *stmt;
1144 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
1145 auto_vec<gimple *> stmts;
1146 basic_block top_bb = NULL;
1147 int i;
1148 bool cfg_changed = false;
1149
1150 type = TREE_TYPE (name);
1151 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
1152 {
1153 if (gimple_code (use_stmt) != GIMPLE_CALL
1154 || !gimple_call_lhs (use_stmt))
1155 continue;
1156
1157 switch (gimple_call_combined_fn (use_stmt))
1158 {
1159 CASE_CFN_COS:
1160 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1161 break;
1162
1163 CASE_CFN_SIN:
1164 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1165 break;
1166
1167 CASE_CFN_CEXPI:
1168 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1169 break;
1170
1171 default:;
1172 }
1173 }
1174
1175 if (seen_cos + seen_sin + seen_cexpi <= 1)
1176 return false;
1177
1178 /* Simply insert cexpi at the beginning of top_bb but not earlier than
1179 the name def statement. */
1180 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
1181 if (!fndecl)
1182 return false;
1183 stmt = gimple_build_call (fndecl, 1, name);
1184 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
1185 gimple_call_set_lhs (stmt, res);
1186
1187 def_stmt = SSA_NAME_DEF_STMT (name);
1188 if (!SSA_NAME_IS_DEFAULT_DEF (name)
1189 && gimple_code (def_stmt) != GIMPLE_PHI
1190 && gimple_bb (def_stmt) == top_bb)
1191 {
1192 gsi = gsi_for_stmt (def_stmt);
1193 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
1194 }
1195 else
1196 {
1197 gsi = gsi_after_labels (top_bb);
1198 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1199 }
1200 sincos_stats.inserted++;
1201
1202 /* And adjust the recorded old call sites. */
1203 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
1204 {
1205 tree rhs = NULL;
1206
1207 switch (gimple_call_combined_fn (use_stmt))
1208 {
1209 CASE_CFN_COS:
1210 rhs = fold_build1 (REALPART_EXPR, type, res);
1211 break;
1212
1213 CASE_CFN_SIN:
1214 rhs = fold_build1 (IMAGPART_EXPR, type, res);
1215 break;
1216
1217 CASE_CFN_CEXPI:
1218 rhs = res;
1219 break;
1220
1221 default:;
1222 gcc_unreachable ();
1223 }
1224
1225 /* Replace call with a copy. */
1226 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
1227
1228 gsi = gsi_for_stmt (use_stmt);
1229 gsi_replace (&gsi, stmt, true);
1230 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
1231 cfg_changed = true;
1232 }
1233
1234 return cfg_changed;
1235 }
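/* A schematic example (float case, made-up SSA names): given

	s_1 = sinf (x_2);
	c_3 = cosf (x_2);

   the pass emits a single cexpif call and rewrites both uses, roughly

	sincostmp_4 = __builtin_cexpif (x_2);
	c_3 = REALPART_EXPR <sincostmp_4>;
	s_1 = IMAGPART_EXPR <sincostmp_4>;

   since cexpi (x) == cos (x) + i * sin (x).  */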
1236
1237 /* To evaluate powi(x,n), the floating point value x raised to the
1238 constant integer exponent n, we use a hybrid algorithm that
1239 combines the "window method" with look-up tables. For an
1240 introduction to exponentiation algorithms and "addition chains",
1241 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
1242 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
1243 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
1244 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
1245
1246 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
1247 multiplications to inline before calling the system library's pow
1248 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
1249 so this default never requires calling pow, powf or powl. */
1250
1251 #ifndef POWI_MAX_MULTS
1252 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
1253 #endif
1254
1255 /* The size of the "optimal power tree" lookup table. All
1256 exponents less than this value are simply looked up in the
1257 powi_table below. This threshold is also used to size the
1258 cache of pseudo registers that hold intermediate results. */
1259 #define POWI_TABLE_SIZE 256
1260
1261 /* The size, in bits of the window, used in the "window method"
1262 exponentiation algorithm. This is equivalent to a radix of
1263 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
1264 #define POWI_WINDOW_SIZE 3
1265
1266 /* The following table is an efficient representation of an
1267 "optimal power tree". For each value, i, the corresponding
1268    value, j, in the table states that an optimal evaluation
1269 sequence for calculating pow(x,i) can be found by evaluating
1270 pow(x,j)*pow(x,i-j). An optimal power tree for the first
1271 100 integers is given in Knuth's "Seminumerical algorithms". */
1272
1273 static const unsigned char powi_table[POWI_TABLE_SIZE] =
1274 {
1275 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
1276 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
1277 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
1278 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
1279 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
1280 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
1281 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
1282 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
1283 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
1284 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
1285 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
1286 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
1287 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
1288 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
1289 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
1290 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
1291 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
1292 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
1293 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
1294 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
1295 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
1296 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
1297 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
1298 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
1299 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
1300 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
1301 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
1302 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
1303 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
1304 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
1305 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
1306 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
1307 };
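/* A worked example of the table above: powi_table[15] == 9, so x**15
   is evaluated as x**9 * x**6; recursing, x**9 == x**6 * x**3,
   x**6 == x**3 * x**3, x**3 == x**2 * x and x**2 == x * x, i.e. five
   multiplications in total instead of the fourteen of a naive
   product.  */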
1308
1309
1310 /* Return the number of multiplications required to calculate
1311 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
1312 subroutine of powi_cost. CACHE is an array indicating
1313 which exponents have already been calculated. */
1314
1315 static int
1316 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
1317 {
1318 /* If we've already calculated this exponent, then this evaluation
1319 doesn't require any additional multiplications. */
1320 if (cache[n])
1321 return 0;
1322
1323 cache[n] = true;
1324 return powi_lookup_cost (n - powi_table[n], cache)
1325 + powi_lookup_cost (powi_table[n], cache) + 1;
1326 }
1327
1328 /* Return the number of multiplications required to calculate
1329 powi(x,n) for an arbitrary x, given the exponent N. This
1330 function needs to be kept in sync with powi_as_mults below. */
1331
1332 static int
1333 powi_cost (HOST_WIDE_INT n)
1334 {
1335 bool cache[POWI_TABLE_SIZE];
1336 unsigned HOST_WIDE_INT digit;
1337 unsigned HOST_WIDE_INT val;
1338 int result;
1339
1340 if (n == 0)
1341 return 0;
1342
1343 /* Ignore the reciprocal when calculating the cost. */
1344 val = (n < 0) ? -n : n;
1345
1346 /* Initialize the exponent cache. */
1347 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
1348 cache[1] = true;
1349
1350 result = 0;
1351
1352 while (val >= POWI_TABLE_SIZE)
1353 {
1354 if (val & 1)
1355 {
1356 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
1357 result += powi_lookup_cost (digit, cache)
1358 + POWI_WINDOW_SIZE + 1;
1359 val >>= POWI_WINDOW_SIZE;
1360 }
1361 else
1362 {
1363 val >>= 1;
1364 result++;
1365 }
1366 }
1367
1368 return result + powi_lookup_cost (val, cache);
1369 }
1370
1371 /* Recursive subroutine of powi_as_mults. This function takes the
1372 array, CACHE, of already calculated exponents and an exponent N and
1373 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
1374
1375 static tree
1376 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1377 HOST_WIDE_INT n, tree *cache)
1378 {
1379 tree op0, op1, ssa_target;
1380 unsigned HOST_WIDE_INT digit;
1381 gassign *mult_stmt;
1382
1383 if (n < POWI_TABLE_SIZE && cache[n])
1384 return cache[n];
1385
1386 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1387
1388 if (n < POWI_TABLE_SIZE)
1389 {
1390 cache[n] = ssa_target;
1391 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1392 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1393 }
1394 else if (n & 1)
1395 {
1396 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1397 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1398 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1399 }
1400 else
1401 {
1402 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1403 op1 = op0;
1404 }
1405
1406 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1407 gimple_set_location (mult_stmt, loc);
1408 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1409
1410 return ssa_target;
1411 }
1412
1413 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1414 This function needs to be kept in sync with powi_cost above. */
1415
1416 static tree
1417 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1418 tree arg0, HOST_WIDE_INT n)
1419 {
1420 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1421 gassign *div_stmt;
1422 tree target;
1423
1424 if (n == 0)
1425 return build_real (type, dconst1);
1426
1427 memset (cache, 0, sizeof (cache));
1428 cache[1] = arg0;
1429
1430 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1431 if (n >= 0)
1432 return result;
1433
1434 /* If the original exponent was negative, reciprocate the result. */
1435 target = make_temp_ssa_name (type, NULL, "powmult");
1436 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1437 build_real (type, dconst1), result);
1438 gimple_set_location (div_stmt, loc);
1439 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1440
1441 return target;
1442 }
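/* A rough sketch (made-up temporaries) of the sequence built here for
   a negative exponent such as powi (x, -5):

	powmult_1 = x * x;
	powmult_2 = x * powmult_1;            (x**3)
	powmult_3 = powmult_1 * powmult_2;    (x**5)
	powmult_4 = 1.0 / powmult_3;

   that is, the positive power followed by a single reciprocal.  */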
1443
1444 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1445 location info LOC. If the arguments are appropriate, create an
1446 equivalent sequence of statements prior to GSI using an optimal
1447    number of multiplications, and return an expression holding the
1448 result. */
1449
1450 static tree
1451 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1452 tree arg0, HOST_WIDE_INT n)
1453 {
1454 /* Avoid largest negative number. */
1455 if (n != -n
1456 && ((n >= -1 && n <= 2)
1457 || (optimize_function_for_speed_p (cfun)
1458 && powi_cost (n) <= POWI_MAX_MULTS)))
1459 return powi_as_mults (gsi, loc, arg0, n);
1460
1461 return NULL_TREE;
1462 }
1463
1464 /* Build a gimple call statement that calls FN with argument ARG.
1465 Set the lhs of the call statement to a fresh SSA name. Insert the
1466 statement prior to GSI's current position, and return the fresh
1467 SSA name. */
1468
1469 static tree
1470 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1471 tree fn, tree arg)
1472 {
1473 gcall *call_stmt;
1474 tree ssa_target;
1475
1476 call_stmt = gimple_build_call (fn, 1, arg);
1477 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1478 gimple_set_lhs (call_stmt, ssa_target);
1479 gimple_set_location (call_stmt, loc);
1480 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1481
1482 return ssa_target;
1483 }
1484
1485 /* Build a gimple binary operation with the given CODE and arguments
1486 ARG0, ARG1, assigning the result to a new SSA name for variable
1487 TARGET. Insert the statement prior to GSI's current position, and
1488 return the fresh SSA name.*/
1489
1490 static tree
1491 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1492 const char *name, enum tree_code code,
1493 tree arg0, tree arg1)
1494 {
1495 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1496 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1497 gimple_set_location (stmt, loc);
1498 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1499 return result;
1500 }
1501
1502 /* Build a gimple reference operation with the given CODE and argument
1503 ARG, assigning the result to a new SSA name of TYPE with NAME.
1504 Insert the statement prior to GSI's current position, and return
1505 the fresh SSA name. */
1506
1507 static inline tree
1508 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1509 const char *name, enum tree_code code, tree arg0)
1510 {
1511 tree result = make_temp_ssa_name (type, NULL, name);
1512 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1513 gimple_set_location (stmt, loc);
1514 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1515 return result;
1516 }
1517
1518 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1519 prior to GSI's current position, and return the fresh SSA name. */
1520
1521 static tree
1522 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1523 tree type, tree val)
1524 {
1525 tree result = make_ssa_name (type);
1526 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1527 gimple_set_location (stmt, loc);
1528 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1529 return result;
1530 }
1531
1532 struct pow_synth_sqrt_info
1533 {
1534 bool *factors;
1535 unsigned int deepest;
1536 unsigned int num_mults;
1537 };
1538
1539 /* Return true iff the real value C can be represented as a
1540 sum of powers of 0.5 up to N. That is:
1541 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1542 Record in INFO the various parameters of the synthesis algorithm such
1543 as the factors a[i], the maximum 0.5 power and the number of
1544 multiplications that will be required. */
1545
1546 bool
1547 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1548 struct pow_synth_sqrt_info *info)
1549 {
1550 REAL_VALUE_TYPE factor = dconsthalf;
1551 REAL_VALUE_TYPE remainder = c;
1552
1553 info->deepest = 0;
1554 info->num_mults = 0;
1555 memset (info->factors, 0, n * sizeof (bool));
1556
1557 for (unsigned i = 0; i < n; i++)
1558 {
1559 REAL_VALUE_TYPE res;
1560
1561 /* If something inexact happened bail out now. */
1562 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1563 return false;
1564
1565 /* We have hit zero. The number is representable as a sum
1566 of powers of 0.5. */
1567 if (real_equal (&res, &dconst0))
1568 {
1569 info->factors[i] = true;
1570 info->deepest = i + 1;
1571 return true;
1572 }
1573 else if (!REAL_VALUE_NEGATIVE (res))
1574 {
1575 remainder = res;
1576 info->factors[i] = true;
1577 info->num_mults++;
1578 }
1579 else
1580 info->factors[i] = false;
1581
1582 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1583 }
1584 return false;
1585 }
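/* A worked example: for C == 0.625 and N >= 3 the loop above finds
   0.625 == 0.5 + 0.125, so factors[0] and factors[2] are set, DEEPEST
   becomes 3 and NUM_MULTS is 1; pow (x, 0.625) can then be synthesized
   as sqrt (x) * sqrt (sqrt (sqrt (x))).  */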
1586
1587 /* Return the tree corresponding to FN being applied
1588 to ARG N times at GSI and LOC.
1589 Look up previous results from CACHE if need be.
1590 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1591
1592 static tree
1593 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1594 tree fn, location_t loc, tree *cache)
1595 {
1596 tree res = cache[n];
1597 if (!res)
1598 {
1599 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1600 res = build_and_insert_call (gsi, loc, fn, prev);
1601 cache[n] = res;
1602 }
1603
1604 return res;
1605 }
1606
1607 /* Print to STREAM the repeated application of function FNAME to ARG
1608 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1609 "foo (foo (x))". */
1610
1611 static void
1612 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1613 unsigned int n)
1614 {
1615 if (n == 0)
1616 fprintf (stream, "%s", arg);
1617 else
1618 {
1619 fprintf (stream, "%s (", fname);
1620 print_nested_fn (stream, fname, arg, n - 1);
1621 fprintf (stream, ")");
1622 }
1623 }
1624
1625 /* Print to STREAM the fractional sequence of sqrt chains
1626 applied to ARG, described by INFO. Used for the dump file. */
1627
1628 static void
1629 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1630 struct pow_synth_sqrt_info *info)
1631 {
1632 for (unsigned int i = 0; i < info->deepest; i++)
1633 {
1634 bool is_set = info->factors[i];
1635 if (is_set)
1636 {
1637 print_nested_fn (stream, "sqrt", arg, i + 1);
1638 if (i != info->deepest - 1)
1639 fprintf (stream, " * ");
1640 }
1641 }
1642 }
1643
1644 /* Print to STREAM a representation of raising ARG to an integer
1645 power N. Used for the dump file. */
1646
1647 static void
1648 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1649 {
1650 if (n > 1)
1651 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1652 else if (n == 1)
1653 fprintf (stream, "%s", arg);
1654 }
1655
1656 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1657 square roots. Place at GSI and LOC. Limit the maximum depth
1658 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1659 result of the expanded sequence or NULL_TREE if the expansion failed.
1660
1661 This routine assumes that ARG1 is a real number with a fractional part
1662 (the integer exponent case will have been handled earlier in
1663 gimple_expand_builtin_pow).
1664
1665 For ARG1 > 0.0:
1666 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1667 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1668 FRAC_PART == ARG1 - WHOLE_PART:
1669 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1670 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1671 if it can be expressed as such, that is if FRAC_PART satisfies:
1672 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1673 where integer a[i] is either 0 or 1.
1674
1675 Example:
1676 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1677 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1678
1679 For ARG1 < 0.0 there are two approaches:
1680 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1681 is calculated as above.
1682
1683 Example:
1684 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1685 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1686
1687 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1688 FRAC_PART := ARG1 - WHOLE_PART
1689 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1690 Example:
1691 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1692 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1693
1694 For ARG1 < 0.0 we choose between (A) and (B) depending on
1695 how many multiplications we'd have to do.
1696 So, for the example in (B): POW (x, -5.875), if we were to
1697 follow algorithm (A) we would produce:
1698 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1699 which contains more multiplications than approach (B).
1700
1701 Hopefully, this approach will eliminate potentially expensive POW library
1702 calls when unsafe floating point math is enabled and allow the compiler to
1703 further optimise the multiplies, square roots and divides produced by this
1704 function. */
1705
1706 static tree
1707 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1708 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1709 {
1710 tree type = TREE_TYPE (arg0);
1711 machine_mode mode = TYPE_MODE (type);
1712 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1713 bool one_over = true;
1714
1715 if (!sqrtfn)
1716 return NULL_TREE;
1717
1718 if (TREE_CODE (arg1) != REAL_CST)
1719 return NULL_TREE;
1720
1721 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1722
1723 gcc_assert (max_depth > 0);
1724 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1725
1726 struct pow_synth_sqrt_info synth_info;
1727 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1728 synth_info.deepest = 0;
1729 synth_info.num_mults = 0;
1730
1731 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1732 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1733
1734 /* The whole and fractional parts of exp. */
1735 REAL_VALUE_TYPE whole_part;
1736 REAL_VALUE_TYPE frac_part;
1737
1738 real_floor (&whole_part, mode, &exp);
1739 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1740
1741
1742 REAL_VALUE_TYPE ceil_whole = dconst0;
1743 REAL_VALUE_TYPE ceil_fract = dconst0;
1744
1745 if (neg_exp)
1746 {
1747 real_ceil (&ceil_whole, mode, &exp);
1748 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1749 }
1750
1751 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1752 return NULL_TREE;
1753
1754 /* Check whether it's more profitable to not use 1.0 / ... */
1755 if (neg_exp)
1756 {
1757 struct pow_synth_sqrt_info alt_synth_info;
1758 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1759 alt_synth_info.deepest = 0;
1760 alt_synth_info.num_mults = 0;
1761
1762 if (representable_as_half_series_p (ceil_fract, max_depth,
1763 &alt_synth_info)
1764 && alt_synth_info.deepest <= synth_info.deepest
1765 && alt_synth_info.num_mults < synth_info.num_mults)
1766 {
1767 whole_part = ceil_whole;
1768 frac_part = ceil_fract;
1769 synth_info.deepest = alt_synth_info.deepest;
1770 synth_info.num_mults = alt_synth_info.num_mults;
1771 memcpy (synth_info.factors, alt_synth_info.factors,
1772 (max_depth + 1) * sizeof (bool));
1773 one_over = false;
1774 }
1775 }
1776
1777 HOST_WIDE_INT n = real_to_integer (&whole_part);
1778 REAL_VALUE_TYPE cint;
1779 real_from_integer (&cint, VOIDmode, n, SIGNED);
1780
1781 if (!real_identical (&whole_part, &cint))
1782 return NULL_TREE;
1783
1784 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1785 return NULL_TREE;
1786
1787 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1788
1789 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1790
1791 /* Calculate the integer part of the exponent. */
1792 if (n > 1)
1793 {
1794 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1795 if (!integer_res)
1796 return NULL_TREE;
1797 }
1798
1799 if (dump_file)
1800 {
1801 char string[64];
1802
1803 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1804 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1805
1806 if (neg_exp)
1807 {
1808 if (one_over)
1809 {
1810 fprintf (dump_file, "1.0 / (");
1811 dump_integer_part (dump_file, "x", n);
1812 if (n > 0)
1813 fprintf (dump_file, " * ");
1814 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1815 fprintf (dump_file, ")");
1816 }
1817 else
1818 {
1819 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1820 fprintf (dump_file, " / (");
1821 dump_integer_part (dump_file, "x", n);
1822 fprintf (dump_file, ")");
1823 }
1824 }
1825 else
1826 {
1827 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1828 if (n > 0)
1829 fprintf (dump_file, " * ");
1830 dump_integer_part (dump_file, "x", n);
1831 }
1832
1833 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1834 }
1835
1836
1837 tree fract_res = NULL_TREE;
1838 cache[0] = arg0;
1839
1840 /* Calculate the fractional part of the exponent. */
1841 for (unsigned i = 0; i < synth_info.deepest; i++)
1842 {
1843 if (synth_info.factors[i])
1844 {
1845 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1846
1847 if (!fract_res)
1848 fract_res = sqrt_chain;
1849
1850 else
1851 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1852 fract_res, sqrt_chain);
1853 }
1854 }
1855
1856 tree res = NULL_TREE;
1857
1858 if (neg_exp)
1859 {
1860 if (one_over)
1861 {
1862 if (n > 0)
1863 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1864 fract_res, integer_res);
1865 else
1866 res = fract_res;
1867
1868 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1869 build_real (type, dconst1), res);
1870 }
1871 else
1872 {
1873 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1874 fract_res, integer_res);
1875 }
1876 }
1877 else
1878 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1879 fract_res, integer_res);
1880 return res;
1881 }
1882
1883 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1884 with location info LOC. If possible, create an equivalent and
1885 less expensive sequence of statements prior to GSI, and return an
1886 expression holding the result. */
1887
1888 static tree
1889 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1890 tree arg0, tree arg1)
1891 {
1892 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1893 REAL_VALUE_TYPE c2, dconst3;
1894 HOST_WIDE_INT n;
1895 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1896 machine_mode mode;
1897 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1898 bool hw_sqrt_exists, c_is_int, c2_is_int;
1899
1900 dconst1_4 = dconst1;
1901 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1902
1903 /* If the exponent isn't a constant, there's nothing of interest
1904 to be done. */
1905 if (TREE_CODE (arg1) != REAL_CST)
1906 return NULL_TREE;
1907
1908 /* Don't perform the operation if flag_signaling_nans is on
1909 and the operand is a signaling NaN. */
1910 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1911 && ((TREE_CODE (arg0) == REAL_CST
1912 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1913 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1914 return NULL_TREE;
1915
1916 /* If the exponent is equivalent to an integer, expand to an optimal
1917 multiplication sequence when profitable. */
1918 c = TREE_REAL_CST (arg1);
1919 n = real_to_integer (&c);
1920 real_from_integer (&cint, VOIDmode, n, SIGNED);
1921 c_is_int = real_identical (&c, &cint);
1922
1923 if (c_is_int
1924 && ((n >= -1 && n <= 2)
1925 || (flag_unsafe_math_optimizations
1926 && speed_p
1927 && powi_cost (n) <= POWI_MAX_MULTS)))
1928 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1929
1930 /* Attempt various optimizations using sqrt and cbrt. */
1931 type = TREE_TYPE (arg0);
1932 mode = TYPE_MODE (type);
1933 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1934
1935 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1936 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1937 sqrt(-0) = -0. */
1938 if (sqrtfn
1939 && real_equal (&c, &dconsthalf)
1940 && !HONOR_SIGNED_ZEROS (mode))
1941 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1942
1943 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1944
1945 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1946 optimizations since 1./3. is not exactly representable. If x
1947 is negative and finite, the correct value of pow(x,1./3.) is
1948 a NaN with the "invalid" exception raised, because the value
1949 of 1./3. actually has an even denominator. The correct value
1950 of cbrt(x) is a negative real value. */
1951 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1952 dconst1_3 = real_value_truncate (mode, dconst_third ());
1953
1954 if (flag_unsafe_math_optimizations
1955 && cbrtfn
1956 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1957 && real_equal (&c, &dconst1_3))
1958 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1959
1960 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1961 if we don't have a hardware sqrt insn. */
1962 dconst1_6 = dconst1_3;
1963 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1964
1965 if (flag_unsafe_math_optimizations
1966 && sqrtfn
1967 && cbrtfn
1968 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1969 && speed_p
1970 && hw_sqrt_exists
1971 && real_equal (&c, &dconst1_6))
1972 {
1973 /* sqrt(x) */
1974 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1975
1976 /* cbrt(sqrt(x)) */
1977 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1978 }
1979
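/* To summarize the special cases above (an illustrative sketch, assuming
   the respective guards are satisfied: unsafe math for the cbrt forms,
   no signed zeros for the sqrt form):

     pow (x, 0.5)    --> sqrt (x)
     pow (x, 1./3.)  --> cbrt (x)
     pow (x, 1./6.)  --> cbrt (sqrt (x))  */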
1980
1981 /* Attempt to expand the POW as a product of square root chains.
1982 Expand the 0.25 case even when optimising for size. */
1983 if (flag_unsafe_math_optimizations
1984 && sqrtfn
1985 && hw_sqrt_exists
1986 && (speed_p || real_equal (&c, &dconst1_4))
1987 && !HONOR_SIGNED_ZEROS (mode))
1988 {
1989 unsigned int max_depth = speed_p
1990 ? param_max_pow_sqrt_depth
1991 : 2;
1992
1993 tree expand_with_sqrts
1994 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1995
1996 if (expand_with_sqrts)
1997 return expand_with_sqrts;
1998 }
1999
2000 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
2001 n = real_to_integer (&c2);
2002 real_from_integer (&cint, VOIDmode, n, SIGNED);
2003 c2_is_int = real_identical (&c2, &cint);
2004
2005 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
2006
2007 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
2008 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
2009
2010 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
2011 different from pow(x, 1./3.) due to rounding and behavior with
2012 negative x, we need to constrain this transformation to unsafe
2013 math and positive x or finite math. */
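/* An illustrative instance of this transformation (a sketch, assuming the
   guards below are satisfied):

     pow (x, 5./3.)   -->  powi (x, 1) * powi (cbrt (x), 2)
                       ==  x * cbrt (x) * cbrt (x)

     pow (x, -2./3.)  -->  1.0 / powi (cbrt (x), 2)  */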
2014 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
2015 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
2016 real_round (&c2, mode, &c2);
2017 n = real_to_integer (&c2);
2018 real_from_integer (&cint, VOIDmode, n, SIGNED);
2019 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
2020 real_convert (&c2, mode, &c2);
2021
2022 if (flag_unsafe_math_optimizations
2023 && cbrtfn
2024 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
2025 && real_identical (&c2, &c)
2026 && !c2_is_int
2027 && optimize_function_for_speed_p (cfun)
2028 && powi_cost (n / 3) <= POWI_MAX_MULTS)
2029 {
2030 tree powi_x_ndiv3 = NULL_TREE;
2031
2032 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
2033 possible or profitable, give up. Skip the degenerate case when
2034 abs(n) < 3, where the result is always 1. */
2035 if (absu_hwi (n) >= 3)
2036 {
2037 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
2038 abs_hwi (n / 3));
2039 if (!powi_x_ndiv3)
2040 return NULL_TREE;
2041 }
2042
2043 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
2044 as that creates an unnecessary variable. Instead, just produce
2045 either cbrt(x) or cbrt(x) * cbrt(x). */
2046 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
2047
2048 if (absu_hwi (n) % 3 == 1)
2049 powi_cbrt_x = cbrt_x;
2050 else
2051 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2052 cbrt_x, cbrt_x);
2053
2054 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
2055 if (absu_hwi (n) < 3)
2056 result = powi_cbrt_x;
2057 else
2058 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2059 powi_x_ndiv3, powi_cbrt_x);
2060
2061 /* If n is negative, reciprocate the result. */
2062 if (n < 0)
2063 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
2064 build_real (type, dconst1), result);
2065
2066 return result;
2067 }
2068
2069 /* No optimizations succeeded. */
2070 return NULL_TREE;
2071 }
2072
2073 /* ARG is the argument to a cabs builtin call in GSI with location info
2074 LOC. Create a sequence of statements prior to GSI that calculates
2075 sqrt(R*R + I*I), where R and I are the real and imaginary components
2076 of ARG, respectively. Return an expression holding the result. */
2077
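/* A sketch of the sequence this produces, for illustration:

     r = cabs (z)   -->   re = REALPART_EXPR <z>;
                          im = IMAGPART_EXPR <z>;
                          r  = sqrt (re*re + im*im);

   Unlike a library cabs, this does not guard against overflow in the
   intermediate squares, hence the flag_unsafe_math_optimizations test.  */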
2078 static tree
2079 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
2080 {
2081 tree real_part, imag_part, addend1, addend2, sum, result;
2082 tree type = TREE_TYPE (TREE_TYPE (arg));
2083 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
2084 machine_mode mode = TYPE_MODE (type);
2085
2086 if (!flag_unsafe_math_optimizations
2087 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
2088 || !sqrtfn
2089 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
2090 return NULL_TREE;
2091
2092 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
2093 REALPART_EXPR, arg);
2094 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2095 real_part, real_part);
2096 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
2097 IMAGPART_EXPR, arg);
2098 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2099 imag_part, imag_part);
2100 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
2101 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
2102
2103 return result;
2104 }
2105
2106 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
2107 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
2108 an optimal number of multiplies, when n is a constant. */
2109
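/* Two illustrative effects of this pass (sketches, not exact GIMPLE):

     s = sin (x);              t = cexpi (x);
     c = cos (x);     -->      s = IMAGPART_EXPR <t>;
                               c = REALPART_EXPR <t>;

     r = powi (x, 5)  -->      t1 = x * x;
                               t2 = t1 * t1;
                               r  = t2 * x;  */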
2110 namespace {
2111
2112 const pass_data pass_data_cse_sincos =
2113 {
2114 GIMPLE_PASS, /* type */
2115 "sincos", /* name */
2116 OPTGROUP_NONE, /* optinfo_flags */
2117 TV_TREE_SINCOS, /* tv_id */
2118 PROP_ssa, /* properties_required */
2119 PROP_gimple_opt_math, /* properties_provided */
2120 0, /* properties_destroyed */
2121 0, /* todo_flags_start */
2122 TODO_update_ssa, /* todo_flags_finish */
2123 };
2124
2125 class pass_cse_sincos : public gimple_opt_pass
2126 {
2127 public:
2128 pass_cse_sincos (gcc::context *ctxt)
2129 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
2130 {}
2131
2132 /* opt_pass methods: */
2133 virtual bool gate (function *)
2134 {
2135 /* We no longer require either sincos or cexp, since powi expansion
2136 piggybacks on this pass. */
2137 return optimize;
2138 }
2139
2140 virtual unsigned int execute (function *);
2141
2142 }; // class pass_cse_sincos
2143
2144 unsigned int
2145 pass_cse_sincos::execute (function *fun)
2146 {
2147 basic_block bb;
2148 bool cfg_changed = false;
2149
2150 calculate_dominance_info (CDI_DOMINATORS);
2151 memset (&sincos_stats, 0, sizeof (sincos_stats));
2152
2153 FOR_EACH_BB_FN (bb, fun)
2154 {
2155 gimple_stmt_iterator gsi;
2156 bool cleanup_eh = false;
2157
2158 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2159 {
2160 gimple *stmt = gsi_stmt (gsi);
2161
2162 /* Only the last stmt in a bb could throw, no need to call
2163 gimple_purge_dead_eh_edges if we change something in the middle
2164 of a basic block. */
2165 cleanup_eh = false;
2166
2167 if (is_gimple_call (stmt)
2168 && gimple_call_lhs (stmt))
2169 {
2170 tree arg, arg0, arg1, result;
2171 HOST_WIDE_INT n;
2172 location_t loc;
2173
2174 switch (gimple_call_combined_fn (stmt))
2175 {
2176 CASE_CFN_COS:
2177 CASE_CFN_SIN:
2178 CASE_CFN_CEXPI:
2179 /* Make sure we have either sincos or cexp. */
2180 if (!targetm.libc_has_function (function_c99_math_complex)
2181 && !targetm.libc_has_function (function_sincos))
2182 break;
2183
2184 arg = gimple_call_arg (stmt, 0);
2185 if (TREE_CODE (arg) == SSA_NAME)
2186 cfg_changed |= execute_cse_sincos_1 (arg);
2187 break;
2188
2189 CASE_CFN_POW:
2190 arg0 = gimple_call_arg (stmt, 0);
2191 arg1 = gimple_call_arg (stmt, 1);
2192
2193 loc = gimple_location (stmt);
2194 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
2195
2196 if (result)
2197 {
2198 tree lhs = gimple_get_lhs (stmt);
2199 gassign *new_stmt = gimple_build_assign (lhs, result);
2200 gimple_set_location (new_stmt, loc);
2201 unlink_stmt_vdef (stmt);
2202 gsi_replace (&gsi, new_stmt, true);
2203 cleanup_eh = true;
2204 if (gimple_vdef (stmt))
2205 release_ssa_name (gimple_vdef (stmt));
2206 }
2207 break;
2208
2209 CASE_CFN_POWI:
2210 arg0 = gimple_call_arg (stmt, 0);
2211 arg1 = gimple_call_arg (stmt, 1);
2212 loc = gimple_location (stmt);
2213
2214 if (real_minus_onep (arg0))
2215 {
2216 tree t0, t1, cond, one, minus_one;
2217 gassign *stmt;
2218
2219 t0 = TREE_TYPE (arg0);
2220 t1 = TREE_TYPE (arg1);
2221 one = build_real (t0, dconst1);
2222 minus_one = build_real (t0, dconstm1);
2223
2224 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2225 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2226 arg1, build_int_cst (t1, 1));
2227 gimple_set_location (stmt, loc);
2228 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2229
2230 result = make_temp_ssa_name (t0, NULL, "powi");
2231 stmt = gimple_build_assign (result, COND_EXPR, cond,
2232 minus_one, one);
2233 gimple_set_location (stmt, loc);
2234 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2235 }
2236 else
2237 {
2238 if (!tree_fits_shwi_p (arg1))
2239 break;
2240
2241 n = tree_to_shwi (arg1);
2242 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2243 }
2244
2245 if (result)
2246 {
2247 tree lhs = gimple_get_lhs (stmt);
2248 gassign *new_stmt = gimple_build_assign (lhs, result);
2249 gimple_set_location (new_stmt, loc);
2250 unlink_stmt_vdef (stmt);
2251 gsi_replace (&gsi, new_stmt, true);
2252 cleanup_eh = true;
2253 if (gimple_vdef (stmt))
2254 release_ssa_name (gimple_vdef (stmt));
2255 }
2256 break;
2257
2258 CASE_CFN_CABS:
2259 arg0 = gimple_call_arg (stmt, 0);
2260 loc = gimple_location (stmt);
2261 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2262
2263 if (result)
2264 {
2265 tree lhs = gimple_get_lhs (stmt);
2266 gassign *new_stmt = gimple_build_assign (lhs, result);
2267 gimple_set_location (new_stmt, loc);
2268 unlink_stmt_vdef (stmt);
2269 gsi_replace (&gsi, new_stmt, true);
2270 cleanup_eh = true;
2271 if (gimple_vdef (stmt))
2272 release_ssa_name (gimple_vdef (stmt));
2273 }
2274 break;
2275
2276 default:;
2277 }
2278 }
2279 }
2280 if (cleanup_eh)
2281 cfg_changed |= gimple_purge_dead_eh_edges (bb);
2282 }
2283
2284 statistics_counter_event (fun, "sincos statements inserted",
2285 sincos_stats.inserted);
2286
2287 return cfg_changed ? TODO_cleanup_cfg : 0;
2288 }
2289
2290 } // anon namespace
2291
2292 gimple_opt_pass *
2293 make_pass_cse_sincos (gcc::context *ctxt)
2294 {
2295 return new pass_cse_sincos (ctxt);
2296 }
2297
2298 /* Return true if stmt is a type conversion operation that can be stripped
2299 when used in a widening multiply operation. */
2300 static bool
2301 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2302 {
2303 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2304
2305 if (TREE_CODE (result_type) == INTEGER_TYPE)
2306 {
2307 tree op_type;
2308 tree inner_op_type;
2309
2310 if (!CONVERT_EXPR_CODE_P (rhs_code))
2311 return false;
2312
2313 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2314
2315 /* If the type of OP has the same precision as the result, then
2316 we can strip this conversion. The multiply operation will be
2317 selected to create the correct extension as a by-product. */
2318 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2319 return true;
2320
2321 /* We can also strip a conversion if it preserves the signed-ness of
2322 the operation and doesn't narrow the range. */
2323 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2324
2325 /* If the inner-most type is unsigned, then we can strip any
2326 intermediate widening operation. If it's signed, then the
2327 intermediate widening operation must also be signed. */
2328 if ((TYPE_UNSIGNED (inner_op_type)
2329 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2330 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2331 return true;
2332
2333 return false;
2334 }
2335
2336 return rhs_code == FIXED_CONVERT_EXPR;
2337 }
2338
2339 /* Return true if RHS is a suitable operand for a widening multiplication,
2340 assuming a target type of TYPE.
2341 There are two cases:
2342
2343 - RHS makes some value at least twice as wide. Store that value
2344 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2345
2346 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2347 but leave *TYPE_OUT untouched. */
2348
2349 static bool
2350 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2351 tree *new_rhs_out)
2352 {
2353 gimple *stmt;
2354 tree type1, rhs1;
2355
2356 if (TREE_CODE (rhs) == SSA_NAME)
2357 {
2358 stmt = SSA_NAME_DEF_STMT (rhs);
2359 if (is_gimple_assign (stmt))
2360 {
2361 if (! widening_mult_conversion_strippable_p (type, stmt))
2362 rhs1 = rhs;
2363 else
2364 {
2365 rhs1 = gimple_assign_rhs1 (stmt);
2366
2367 if (TREE_CODE (rhs1) == INTEGER_CST)
2368 {
2369 *new_rhs_out = rhs1;
2370 *type_out = NULL;
2371 return true;
2372 }
2373 }
2374 }
2375 else
2376 rhs1 = rhs;
2377
2378 type1 = TREE_TYPE (rhs1);
2379
2380 if (TREE_CODE (type1) != TREE_CODE (type)
2381 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2382 return false;
2383
2384 *new_rhs_out = rhs1;
2385 *type_out = type1;
2386 return true;
2387 }
2388
2389 if (TREE_CODE (rhs) == INTEGER_CST)
2390 {
2391 *new_rhs_out = rhs;
2392 *type_out = NULL;
2393 return true;
2394 }
2395
2396 return false;
2397 }
2398
2399 /* Return true if STMT performs a widening multiplication, assuming the
2400 output type is TYPE. If so, store the unwidened types of the operands
2401 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2402 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2403 and *TYPE2_OUT would give the operands of the multiplication. */
2404
2405 static bool
2406 is_widening_mult_p (gimple *stmt,
2407 tree *type1_out, tree *rhs1_out,
2408 tree *type2_out, tree *rhs2_out)
2409 {
2410 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2411
2412 if (TREE_CODE (type) == INTEGER_TYPE)
2413 {
2414 if (TYPE_OVERFLOW_TRAPS (type))
2415 return false;
2416 }
2417 else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2418 return false;
2419
2420 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2421 rhs1_out))
2422 return false;
2423
2424 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2425 rhs2_out))
2426 return false;
2427
2428 if (*type1_out == NULL)
2429 {
2430 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2431 return false;
2432 *type1_out = *type2_out;
2433 }
2434
2435 if (*type2_out == NULL)
2436 {
2437 if (!int_fits_type_p (*rhs2_out, *type1_out))
2438 return false;
2439 *type2_out = *type1_out;
2440 }
2441
2442 /* Ensure that the larger of the two operands comes first. */
2443 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2444 {
2445 std::swap (*type1_out, *type2_out);
2446 std::swap (*rhs1_out, *rhs2_out);
2447 }
2448
2449 return true;
2450 }
2451
2452 /* Check to see if the CALL statement is an invocation of copysign
2453 with 1. being the first argument. */
2454 static bool
2455 is_copysign_call_with_1 (gimple *call)
2456 {
2457 gcall *c = dyn_cast <gcall *> (call);
2458 if (! c)
2459 return false;
2460
2461 enum combined_fn code = gimple_call_combined_fn (c);
2462
2463 if (code == CFN_LAST)
2464 return false;
2465
2466 if (builtin_fn_p (code))
2467 {
2468 switch (as_builtin_fn (code))
2469 {
2470 CASE_FLT_FN (BUILT_IN_COPYSIGN):
2471 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2472 return real_onep (gimple_call_arg (c, 0));
2473 default:
2474 return false;
2475 }
2476 }
2477
2478 if (internal_fn_p (code))
2479 {
2480 switch (as_internal_fn (code))
2481 {
2482 case IFN_COPYSIGN:
2483 return real_onep (gimple_call_arg (c, 0));
2484 default:
2485 return false;
2486 }
2487 }
2488
2489 return false;
2490 }
2491
2492 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2493 This only happens when the xorsign optab is defined.  If the
2494 pattern is not a xorsign pattern or if expansion fails, FALSE is
2495 returned; otherwise TRUE is returned. */
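/* For example (an illustrative sketch, assuming the xorsign optab exists
   for the mode of X):

     t = copysign (1.0, y);
     r = x * t;                -->   r = .XORSIGN (x, y);

   i.e. the sign of Y is applied to X with a sign-bit XOR instead of a
   multiplication.  */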
2496 static bool
2497 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2498 {
2499 tree treeop0, treeop1, lhs, type;
2500 location_t loc = gimple_location (stmt);
2501 lhs = gimple_assign_lhs (stmt);
2502 treeop0 = gimple_assign_rhs1 (stmt);
2503 treeop1 = gimple_assign_rhs2 (stmt);
2504 type = TREE_TYPE (lhs);
2505 machine_mode mode = TYPE_MODE (type);
2506
2507 if (HONOR_SNANS (type))
2508 return false;
2509
2510 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2511 {
2512 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2513 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2514 {
2515 call0 = SSA_NAME_DEF_STMT (treeop1);
2516 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2517 return false;
2518
2519 treeop1 = treeop0;
2520 }
2521 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2522 return false;
2523
2524 gcall *c = as_a<gcall*> (call0);
2525 treeop0 = gimple_call_arg (c, 1);
2526
2527 gcall *call_stmt
2528 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2529 gimple_set_lhs (call_stmt, lhs);
2530 gimple_set_location (call_stmt, loc);
2531 gsi_replace (gsi, call_stmt, true);
2532 return true;
2533 }
2534
2535 return false;
2536 }
2537
2538 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2539 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2540 value is true iff we converted the statement. */
2541
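/* An illustrative source-level candidate (a sketch; whether it is used
   depends on the target's widening multiply support):

     int32_t a, b;
     int64_t r = (int64_t) a * (int64_t) b;

   The conversions feeding the multiply are stripped and the statement is
   rewritten as

     r = a w* b;   (WIDEN_MULT_EXPR)  */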
2542 static bool
2543 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2544 {
2545 tree lhs, rhs1, rhs2, type, type1, type2;
2546 enum insn_code handler;
2547 scalar_int_mode to_mode, from_mode, actual_mode;
2548 optab op;
2549 int actual_precision;
2550 location_t loc = gimple_location (stmt);
2551 bool from_unsigned1, from_unsigned2;
2552
2553 lhs = gimple_assign_lhs (stmt);
2554 type = TREE_TYPE (lhs);
2555 if (TREE_CODE (type) != INTEGER_TYPE)
2556 return false;
2557
2558 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2559 return false;
2560
2561 to_mode = SCALAR_INT_TYPE_MODE (type);
2562 from_mode = SCALAR_INT_TYPE_MODE (type1);
2563 if (to_mode == from_mode)
2564 return false;
2565
2566 from_unsigned1 = TYPE_UNSIGNED (type1);
2567 from_unsigned2 = TYPE_UNSIGNED (type2);
2568
2569 if (from_unsigned1 && from_unsigned2)
2570 op = umul_widen_optab;
2571 else if (!from_unsigned1 && !from_unsigned2)
2572 op = smul_widen_optab;
2573 else
2574 op = usmul_widen_optab;
2575
2576 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2577 &actual_mode);
2578
2579 if (handler == CODE_FOR_nothing)
2580 {
2581 if (op != smul_widen_optab)
2582 {
2583 /* We can use a signed multiply with unsigned types as long as
2584 there is a wider mode to use, or it is the smaller of the two
2585 types that is unsigned. Note that type1 >= type2, always. */
2586 if ((TYPE_UNSIGNED (type1)
2587 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2588 || (TYPE_UNSIGNED (type2)
2589 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2590 {
2591 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2592 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2593 return false;
2594 }
2595
2596 op = smul_widen_optab;
2597 handler = find_widening_optab_handler_and_mode (op, to_mode,
2598 from_mode,
2599 &actual_mode);
2600
2601 if (handler == CODE_FOR_nothing)
2602 return false;
2603
2604 from_unsigned1 = from_unsigned2 = false;
2605 }
2606 else
2607 return false;
2608 }
2609
2610 /* Ensure that the inputs to the handler are in the correct precision
2611 for the opcode. This will be the full mode size. */
2612 actual_precision = GET_MODE_PRECISION (actual_mode);
2613 if (2 * actual_precision > TYPE_PRECISION (type))
2614 return false;
2615 if (actual_precision != TYPE_PRECISION (type1)
2616 || from_unsigned1 != TYPE_UNSIGNED (type1))
2617 rhs1 = build_and_insert_cast (gsi, loc,
2618 build_nonstandard_integer_type
2619 (actual_precision, from_unsigned1), rhs1);
2620 if (actual_precision != TYPE_PRECISION (type2)
2621 || from_unsigned2 != TYPE_UNSIGNED (type2))
2622 rhs2 = build_and_insert_cast (gsi, loc,
2623 build_nonstandard_integer_type
2624 (actual_precision, from_unsigned2), rhs2);
2625
2626 /* Handle constants. */
2627 if (TREE_CODE (rhs1) == INTEGER_CST)
2628 rhs1 = fold_convert (type1, rhs1);
2629 if (TREE_CODE (rhs2) == INTEGER_CST)
2630 rhs2 = fold_convert (type2, rhs2);
2631
2632 gimple_assign_set_rhs1 (stmt, rhs1);
2633 gimple_assign_set_rhs2 (stmt, rhs2);
2634 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2635 update_stmt (stmt);
2636 widen_mul_stats.widen_mults_inserted++;
2637 return true;
2638 }
2639
2640 /* Process a single gimple statement STMT, which is found at the
2641 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2642 rhs (given by CODE), and try to convert it into a
2643 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2644 is true iff we converted the statement. */
2645
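/* An illustrative candidate (a sketch; it requires a widening
   multiply-and-accumulate pattern for the mode combination):

     int32_t a, b;
     int64_t acc;
     acc = (int64_t) a * (int64_t) b + acc;

   which becomes the single statement

     acc = WIDEN_MULT_PLUS_EXPR <a, b, acc>;  */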
2646 static bool
2647 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2648 enum tree_code code)
2649 {
2650 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2651 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2652 tree type, type1, type2, optype;
2653 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2654 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2655 optab this_optab;
2656 enum tree_code wmult_code;
2657 enum insn_code handler;
2658 scalar_mode to_mode, from_mode, actual_mode;
2659 location_t loc = gimple_location (stmt);
2660 int actual_precision;
2661 bool from_unsigned1, from_unsigned2;
2662
2663 lhs = gimple_assign_lhs (stmt);
2664 type = TREE_TYPE (lhs);
2665 if (TREE_CODE (type) != INTEGER_TYPE
2666 && TREE_CODE (type) != FIXED_POINT_TYPE)
2667 return false;
2668
2669 if (code == MINUS_EXPR)
2670 wmult_code = WIDEN_MULT_MINUS_EXPR;
2671 else
2672 wmult_code = WIDEN_MULT_PLUS_EXPR;
2673
2674 rhs1 = gimple_assign_rhs1 (stmt);
2675 rhs2 = gimple_assign_rhs2 (stmt);
2676
2677 if (TREE_CODE (rhs1) == SSA_NAME)
2678 {
2679 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2680 if (is_gimple_assign (rhs1_stmt))
2681 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2682 }
2683
2684 if (TREE_CODE (rhs2) == SSA_NAME)
2685 {
2686 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2687 if (is_gimple_assign (rhs2_stmt))
2688 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2689 }
2690
2691 /* Allow for one conversion statement between the multiply
2692 and addition/subtraction statement. If there are multiple
2693 conversions then we assume they would invalidate this
2694 transformation. If that's not the case then they should have
2695 been folded before now. */
2696 if (CONVERT_EXPR_CODE_P (rhs1_code))
2697 {
2698 conv1_stmt = rhs1_stmt;
2699 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2700 if (TREE_CODE (rhs1) == SSA_NAME)
2701 {
2702 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2703 if (is_gimple_assign (rhs1_stmt))
2704 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2705 }
2706 else
2707 return false;
2708 }
2709 if (CONVERT_EXPR_CODE_P (rhs2_code))
2710 {
2711 conv2_stmt = rhs2_stmt;
2712 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2713 if (TREE_CODE (rhs2) == SSA_NAME)
2714 {
2715 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2716 if (is_gimple_assign (rhs2_stmt))
2717 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2718 }
2719 else
2720 return false;
2721 }
2722
2723 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2724 is_widening_mult_p, but we still need the rhs returns.
2725
2726 It might also appear that it would be sufficient to use the existing
2727 operands of the widening multiply, but that would limit the choice of
2728 multiply-and-accumulate instructions.
2729
2730 If the widened-multiplication result has more than one use, it is
2731 probably wiser not to do the conversion. Also restrict this operation
2732 to a single basic block to avoid moving the multiply to a different block
2733 with a higher execution frequency. */
2734 if (code == PLUS_EXPR
2735 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2736 {
2737 if (!has_single_use (rhs1)
2738 || gimple_bb (rhs1_stmt) != gimple_bb (stmt)
2739 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2740 &type2, &mult_rhs2))
2741 return false;
2742 add_rhs = rhs2;
2743 conv_stmt = conv1_stmt;
2744 }
2745 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2746 {
2747 if (!has_single_use (rhs2)
2748 || gimple_bb (rhs2_stmt) != gimple_bb (stmt)
2749 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2750 &type2, &mult_rhs2))
2751 return false;
2752 add_rhs = rhs1;
2753 conv_stmt = conv2_stmt;
2754 }
2755 else
2756 return false;
2757
2758 to_mode = SCALAR_TYPE_MODE (type);
2759 from_mode = SCALAR_TYPE_MODE (type1);
2760 if (to_mode == from_mode)
2761 return false;
2762
2763 from_unsigned1 = TYPE_UNSIGNED (type1);
2764 from_unsigned2 = TYPE_UNSIGNED (type2);
2765 optype = type1;
2766
2767 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2768 if (from_unsigned1 != from_unsigned2)
2769 {
2770 if (!INTEGRAL_TYPE_P (type))
2771 return false;
2772 /* We can use a signed multiply with unsigned types as long as
2773 there is a wider mode to use, or it is the smaller of the two
2774 types that is unsigned. Note that type1 >= type2, always. */
2775 if ((from_unsigned1
2776 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2777 || (from_unsigned2
2778 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2779 {
2780 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2781 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2782 return false;
2783 }
2784
2785 from_unsigned1 = from_unsigned2 = false;
2786 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2787 false);
2788 }
2789
2790 /* If there was a conversion between the multiply and addition
2791 then we need to make sure it fits a multiply-and-accumulate.
2792 There should be a single mode change which does not change the
2793 value. */
2794 if (conv_stmt)
2795 {
2796 /* We use the original, unmodified data types for this. */
2797 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2798 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2799 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2800 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2801
2802 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2803 {
2804 /* Conversion is a truncate. */
2805 if (TYPE_PRECISION (to_type) < data_size)
2806 return false;
2807 }
2808 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2809 {
2810 /* Conversion is an extend. Check it's the right sort. */
2811 if (TYPE_UNSIGNED (from_type) != is_unsigned
2812 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2813 return false;
2814 }
2815 /* else convert is a no-op for our purposes. */
2816 }
2817
2818 /* Verify that the machine can perform a widening multiply
2819 accumulate in this mode/signedness combination, otherwise
2820 this transformation is likely to pessimize code. */
2821 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2822 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2823 from_mode, &actual_mode);
2824
2825 if (handler == CODE_FOR_nothing)
2826 return false;
2827
2828 /* Ensure that the inputs to the handler are in the correct precision
2829 for the opcode. This will be the full mode size. */
2830 actual_precision = GET_MODE_PRECISION (actual_mode);
2831 if (actual_precision != TYPE_PRECISION (type1)
2832 || from_unsigned1 != TYPE_UNSIGNED (type1))
2833 mult_rhs1 = build_and_insert_cast (gsi, loc,
2834 build_nonstandard_integer_type
2835 (actual_precision, from_unsigned1),
2836 mult_rhs1);
2837 if (actual_precision != TYPE_PRECISION (type2)
2838 || from_unsigned2 != TYPE_UNSIGNED (type2))
2839 mult_rhs2 = build_and_insert_cast (gsi, loc,
2840 build_nonstandard_integer_type
2841 (actual_precision, from_unsigned2),
2842 mult_rhs2);
2843
2844 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2845 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2846
2847 /* Handle constants. */
2848 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2849 mult_rhs1 = fold_convert (type1, mult_rhs1);
2850 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2851 mult_rhs2 = fold_convert (type2, mult_rhs2);
2852
2853 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2854 add_rhs);
2855 update_stmt (gsi_stmt (*gsi));
2856 widen_mul_stats.maccs_inserted++;
2857 return true;
2858 }
2859
2860 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2861 OP2 and which we know is used in statements that can be, together with the
2862 multiplication, converted to FMAs, perform the transformation. */
2863
2864 static void
2865 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2866 {
2867 tree type = TREE_TYPE (mul_result);
2868 gimple *use_stmt;
2869 imm_use_iterator imm_iter;
2870 gcall *fma_stmt;
2871
2872 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2873 {
2874 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2875 tree addop, mulop1 = op1, result = mul_result;
2876 bool negate_p = false;
2877 gimple_seq seq = NULL;
2878
2879 if (is_gimple_debug (use_stmt))
2880 continue;
2881
2882 if (is_gimple_assign (use_stmt)
2883 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2884 {
2885 result = gimple_assign_lhs (use_stmt);
2886 use_operand_p use_p;
2887 gimple *neguse_stmt;
2888 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2889 gsi_remove (&gsi, true);
2890 release_defs (use_stmt);
2891
2892 use_stmt = neguse_stmt;
2893 gsi = gsi_for_stmt (use_stmt);
2894 negate_p = true;
2895 }
2896
2897 tree cond, else_value, ops[3];
2898 tree_code code;
2899 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2900 ops, &else_value))
2901 gcc_unreachable ();
2902 addop = ops[0] == result ? ops[1] : ops[0];
2903
2904 if (code == MINUS_EXPR)
2905 {
2906 if (ops[0] == result)
2907 /* a * b - c -> a * b + (-c) */
2908 addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2909 else
2910 /* a - b * c -> (-b) * c + a */
2911 negate_p = !negate_p;
2912 }
2913
2914 if (negate_p)
2915 mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2916
2917 if (seq)
2918 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2919
2920 if (cond)
2921 fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2922 op2, addop, else_value);
2923 else
2924 fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2925 gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2926 gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2927 use_stmt));
2928 gsi_replace (&gsi, fma_stmt, true);
2929 /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2930 regardless of where the negation occurs. */
2931 gimple *orig_stmt = gsi_stmt (gsi);
2932 if (fold_stmt (&gsi, follow_all_ssa_edges))
2933 {
2934 if (maybe_clean_or_replace_eh_stmt (orig_stmt, gsi_stmt (gsi)))
2935 gcc_unreachable ();
2936 update_stmt (gsi_stmt (gsi));
2937 }
2938
2939 if (dump_file && (dump_flags & TDF_DETAILS))
2940 {
2941 fprintf (dump_file, "Generated FMA ");
2942 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2943 fprintf (dump_file, "\n");
2944 }
2945
2946 /* If the FMA result is negated in a single use, fold the negation
2947 too. */
2948 orig_stmt = gsi_stmt (gsi);
2949 use_operand_p use_p;
2950 gimple *neg_stmt;
2951 if (is_gimple_call (orig_stmt)
2952 && gimple_call_internal_p (orig_stmt)
2953 && gimple_call_lhs (orig_stmt)
2954 && TREE_CODE (gimple_call_lhs (orig_stmt)) == SSA_NAME
2955 && single_imm_use (gimple_call_lhs (orig_stmt), &use_p, &neg_stmt)
2956 && is_gimple_assign (neg_stmt)
2957 && gimple_assign_rhs_code (neg_stmt) == NEGATE_EXPR
2958 && !stmt_could_throw_p (cfun, neg_stmt))
2959 {
2960 gsi = gsi_for_stmt (neg_stmt);
2961 if (fold_stmt (&gsi, follow_all_ssa_edges))
2962 {
2963 if (maybe_clean_or_replace_eh_stmt (neg_stmt, gsi_stmt (gsi)))
2964 gcc_unreachable ();
2965 update_stmt (gsi_stmt (gsi));
2966 if (dump_file && (dump_flags & TDF_DETAILS))
2967 {
2968 fprintf (dump_file, "Folded FMA negation ");
2969 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2970 fprintf (dump_file, "\n");
2971 }
2972 }
2973 }
2974
2975 widen_mul_stats.fmas_inserted++;
2976 }
2977 }
2978
2979 /* Data necessary to perform the actual transformation from a multiplication
2980 and an addition to an FMA once the decision to do so has been taken, and
2981 to then delete the multiplication statement from the function IL. */
2982
2983 struct fma_transformation_info
2984 {
2985 gimple *mul_stmt;
2986 tree mul_result;
2987 tree op1;
2988 tree op2;
2989 };
2990
2991 /* Structure containing the current state of FMA deferring, i.e. whether we are
2992 deferring, whether to continue deferring, and all data necessary to come
2993 back and perform all deferred transformations. */
2994
2995 class fma_deferring_state
2996 {
2997 public:
2998 /* Class constructor. Pass true as PERFORM_DEFERRING in order to actually
2999 do any deferring. */
3000
3001 fma_deferring_state (bool perform_deferring)
3002 : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
3003 m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
3004
3005 /* List of FMA candidates for which the transformation has been determined
3006 to be possible but which, at this point in the BB analysis, we do not yet
3007 consider beneficial. */
3008 auto_vec<fma_transformation_info, 8> m_candidates;
3009
3010 /* Set of results of multiplication that are part of an already deferred FMA
3011 candidates. */
3012 hash_set<tree> m_mul_result_set;
3013
3014 /* The PHI that supposedly feeds back the result of one FMA to another across
3015 a loop boundary. */
3016 gphi *m_initial_phi;
3017
3018 /* Result of the last produced FMA candidate or NULL if there has not been
3019 one. */
3020 tree m_last_result;
3021
3022 /* If true, deferring might still be profitable. If false, transform all
3023 candidates and no longer defer. */
3024 bool m_deferring_p;
3025 };
3026
3027 /* Transform all deferred FMA candidates and mark STATE as no longer
3028 deferring. */
3029
3030 static void
3031 cancel_fma_deferring (fma_deferring_state *state)
3032 {
3033 if (!state->m_deferring_p)
3034 return;
3035
3036 for (unsigned i = 0; i < state->m_candidates.length (); i++)
3037 {
3038 if (dump_file && (dump_flags & TDF_DETAILS))
3039 fprintf (dump_file, "Generating deferred FMA\n");
3040
3041 const fma_transformation_info &fti = state->m_candidates[i];
3042 convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
3043
3044 gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
3045 gsi_remove (&gsi, true);
3046 release_defs (fti.mul_stmt);
3047 }
3048 state->m_deferring_p = false;
3049 }
3050
3051 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
3052 Otherwise return NULL. */
3053
3054 static gphi *
3055 result_of_phi (tree op)
3056 {
3057 if (TREE_CODE (op) != SSA_NAME)
3058 return NULL;
3059
3060 return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3061 }
3062
3063 /* After processing statements of a BB and recording STATE, return true if the
3064 initial phi is fed by the last FMA candidate result or one such result from
3065 previously processed BBs marked in LAST_RESULT_SET. */
3066
3067 static bool
3068 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3069 hash_set<tree> *last_result_set)
3070 {
3071 ssa_op_iter iter;
3072 use_operand_p use;
3073 FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3074 {
3075 tree t = USE_FROM_PTR (use);
3076 if (t == state->m_last_result
3077 || last_result_set->contains (t))
3078 return true;
3079 }
3080
3081 return false;
3082 }
3083
3084 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3085 with uses in additions and subtractions to form fused multiply-add
3086 operations. Returns true if successful and MUL_STMT should be removed.
3087 If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional
3088 on MUL_COND, otherwise it is unconditional.
3089
3090 If STATE indicates that we are deferring FMA transformation, that means
3091 that we do not produce FMAs for basic blocks which look like:
3092
3093 <bb 6>
3094 # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3095 _65 = _14 * _16;
3096 accumulator_66 = _65 + accumulator_111;
3097
3098 or its unrolled version, i.e. with several FMA candidates that feed the result
3099 of one into the addend of another. Instead, we add them to a list in STATE
3100 and if we later discover an FMA candidate that is not part of such a chain,
3101 we go back and perform all deferred past candidates. */
3102
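/* For reference, a sketch of the fused forms the subsequent folding can
   produce (using the internal function names):

     d = a * b + c;       -->  d = .FMA  (a, b, c);
     d = a * b - c;       -->  d = .FMS  (a, b, c);
     d = -(a * b) + c;    -->  d = .FNMA (a, b, c);
     d = -(a * b) - c;    -->  d = .FNMS (a, b, c);  */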
3103 static bool
3104 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3105 fma_deferring_state *state, tree mul_cond = NULL_TREE)
3106 {
3107 tree mul_result = gimple_get_lhs (mul_stmt);
3108 tree type = TREE_TYPE (mul_result);
3109 gimple *use_stmt, *neguse_stmt;
3110 use_operand_p use_p;
3111 imm_use_iterator imm_iter;
3112
3113 if (FLOAT_TYPE_P (type)
3114 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3115 return false;
3116
3117 /* We don't want to do bitfield reduction ops. */
3118 if (INTEGRAL_TYPE_P (type)
3119 && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3120 return false;
3121
3122 /* If the target doesn't support it, don't generate it. We assume that
3123 if fma isn't available then fms, fnma or fnms are not either. */
3124 optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3125 if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3126 return false;
3127
3128 /* If the multiplication has zero uses, it is kept around probably because
3129 of -fnon-call-exceptions. Don't optimize it away in that case,
3130 it is DCE job. */
3131 if (has_zero_uses (mul_result))
3132 return false;
3133
3134 bool check_defer
3135 = (state->m_deferring_p
3136 && (tree_to_shwi (TYPE_SIZE (type))
3137 <= param_avoid_fma_max_bits));
3138 bool defer = check_defer;
3139 bool seen_negate_p = false;
3140 /* Make sure that the multiplication statement becomes dead after
3141 the transformation, i.e. that all uses are transformed to FMAs.
3142 This means we assume that an FMA operation has the same cost
3143 as an addition. */
3144 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3145 {
3146 tree result = mul_result;
3147 bool negate_p = false;
3148
3149 use_stmt = USE_STMT (use_p);
3150
3151 if (is_gimple_debug (use_stmt))
3152 continue;
3153
3154 /* For now restrict this operation to single basic blocks. In theory
3155 we would want to support sinking the multiplication in
3156 m = a*b;
3157 if ()
3158 ma = m + c;
3159 else
3160 d = m;
3161 to form a fma in the then block and sink the multiplication to the
3162 else block. */
3163 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3164 return false;
3165
3166 /* A negate on the multiplication leads to FNMA. */
3167 if (is_gimple_assign (use_stmt)
3168 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
3169 {
3170 ssa_op_iter iter;
3171 use_operand_p usep;
3172
3173 /* If (due to earlier missed optimizations) we have two
3174 negates of the same value, treat them as equivalent
3175 to a single negate with multiple uses. */
3176 if (seen_negate_p)
3177 return false;
3178
3179 result = gimple_assign_lhs (use_stmt);
3180
3181 /* Make sure the negate statement becomes dead with this
3182 single transformation. */
3183 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3184 &use_p, &neguse_stmt))
3185 return false;
3186
3187 /* Make sure the multiplication isn't also used on that stmt. */
3188 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3189 if (USE_FROM_PTR (usep) == mul_result)
3190 return false;
3191
3192 /* Re-validate. */
3193 use_stmt = neguse_stmt;
3194 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3195 return false;
3196
3197 negate_p = seen_negate_p = true;
3198 }
3199
3200 tree cond, else_value, ops[3];
3201 tree_code code;
3202 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
3203 &else_value))
3204 return false;
3205
3206 switch (code)
3207 {
3208 case MINUS_EXPR:
3209 if (ops[1] == result)
3210 negate_p = !negate_p;
3211 break;
3212 case PLUS_EXPR:
3213 break;
3214 default:
3215 /* FMA can only be formed from PLUS and MINUS. */
3216 return false;
3217 }
3218
3219 if (mul_cond && cond != mul_cond)
3220 return false;
3221
3222 if (cond)
3223 {
3224 if (cond == result || else_value == result)
3225 return false;
3226 if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
3227 return false;
3228 }
3229
3230 /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
3231 we'll visit later, we might be able to get a more profitable
3232 match with fnma.
3233 OTOH, if we don't, a negate / fma pair likely has lower latency
3234 than a mult / subtract pair. */
3235 if (code == MINUS_EXPR
3236 && !negate_p
3237 && ops[0] == result
3238 && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
3239 && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
3240 && TREE_CODE (ops[1]) == SSA_NAME
3241 && has_single_use (ops[1]))
3242 {
3243 gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
3244 if (is_gimple_assign (stmt2)
3245 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3246 return false;
3247 }
3248
3249 /* We can't handle a * b + a * b. */
3250 if (ops[0] == ops[1])
3251 return false;
3252 /* If deferring, make sure we are not looking at an instruction that
3253 wouldn't have existed if we were not. */
3254 if (state->m_deferring_p
3255 && (state->m_mul_result_set.contains (ops[0])
3256 || state->m_mul_result_set.contains (ops[1])))
3257 return false;
3258
3259 if (check_defer)
3260 {
3261 tree use_lhs = gimple_get_lhs (use_stmt);
3262 if (state->m_last_result)
3263 {
3264 if (ops[1] == state->m_last_result
3265 || ops[0] == state->m_last_result)
3266 defer = true;
3267 else
3268 defer = false;
3269 }
3270 else
3271 {
3272 gcc_checking_assert (!state->m_initial_phi);
3273 gphi *phi;
3274 if (ops[0] == result)
3275 phi = result_of_phi (ops[1]);
3276 else
3277 {
3278 gcc_assert (ops[1] == result);
3279 phi = result_of_phi (ops[0]);
3280 }
3281
3282 if (phi)
3283 {
3284 state->m_initial_phi = phi;
3285 defer = true;
3286 }
3287 else
3288 defer = false;
3289 }
3290
3291 state->m_last_result = use_lhs;
3292 check_defer = false;
3293 }
3294 else
3295 defer = false;
3296
3297 /* While it is possible to validate whether or not the exact form that
3298 we've recognized is available in the backend, the assumption is that
3299 if the deferring logic above did not trigger, the transformation is
3300 never a loss. For instance, suppose the target only has the plain FMA
3301 pattern available. Consider a*b-c -> fma(a,b,-c): we've exchanged
3302 MUL+SUB for FMA+NEG, which is still two operations. Consider
3303 -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3304 form the two NEGs are independent and could be run in parallel. */
3305 }
3306
3307 if (defer)
3308 {
3309 fma_transformation_info fti;
3310 fti.mul_stmt = mul_stmt;
3311 fti.mul_result = mul_result;
3312 fti.op1 = op1;
3313 fti.op2 = op2;
3314 state->m_candidates.safe_push (fti);
3315 state->m_mul_result_set.add (mul_result);
3316
3317 if (dump_file && (dump_flags & TDF_DETAILS))
3318 {
3319 fprintf (dump_file, "Deferred generating FMA for multiplication ");
3320 print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
3321 fprintf (dump_file, "\n");
3322 }
3323
3324 return false;
3325 }
3326 else
3327 {
3328 if (state->m_deferring_p)
3329 cancel_fma_deferring (state);
3330 convert_mult_to_fma_1 (mul_result, op1, op2);
3331 return true;
3332 }
3333 }
3334
3335
3336 /* Helper function of match_uaddsub_overflow. Return 1
3337 if USE_STMT is an unsigned overflow check ovf != 0 for
3338 STMT, -1 if USE_STMT is an unsigned overflow check ovf == 0,
3339 and 0 otherwise. */
3340
3341 static int
3342 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3343 {
3344 enum tree_code ccode = ERROR_MARK;
3345 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3346 if (gimple_code (use_stmt) == GIMPLE_COND)
3347 {
3348 ccode = gimple_cond_code (use_stmt);
3349 crhs1 = gimple_cond_lhs (use_stmt);
3350 crhs2 = gimple_cond_rhs (use_stmt);
3351 }
3352 else if (is_gimple_assign (use_stmt))
3353 {
3354 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3355 {
3356 ccode = gimple_assign_rhs_code (use_stmt);
3357 crhs1 = gimple_assign_rhs1 (use_stmt);
3358 crhs2 = gimple_assign_rhs2 (use_stmt);
3359 }
3360 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3361 {
3362 tree cond = gimple_assign_rhs1 (use_stmt);
3363 if (COMPARISON_CLASS_P (cond))
3364 {
3365 ccode = TREE_CODE (cond);
3366 crhs1 = TREE_OPERAND (cond, 0);
3367 crhs2 = TREE_OPERAND (cond, 1);
3368 }
3369 else
3370 return 0;
3371 }
3372 else
3373 return 0;
3374 }
3375 else
3376 return 0;
3377
3378 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3379 return 0;
3380
3381 enum tree_code code = gimple_assign_rhs_code (stmt);
3382 tree lhs = gimple_assign_lhs (stmt);
3383 tree rhs1 = gimple_assign_rhs1 (stmt);
3384 tree rhs2 = gimple_assign_rhs2 (stmt);
3385
3386 switch (ccode)
3387 {
3388 case GT_EXPR:
3389 case LE_EXPR:
3390 /* r = a - b; r > a or r <= a
3391 r = a + b; a > r or a <= r or b > r or b <= r. */
3392 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3393 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3394 && crhs2 == lhs))
3395 return ccode == GT_EXPR ? 1 : -1;
3396 break;
3397 case LT_EXPR:
3398 case GE_EXPR:
3399 /* r = a - b; a < r or a >= r
3400 r = a + b; r < a or r >= a or r < b or r >= b. */
3401 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3402 || (code == PLUS_EXPR && crhs1 == lhs
3403 && (crhs2 == rhs1 || crhs2 == rhs2)))
3404 return ccode == LT_EXPR ? 1 : -1;
3405 break;
3406 default:
3407 break;
3408 }
3409 return 0;
3410 }
3411
3412 /* Recognize for unsigned x
3413 x = y - z;
3414 if (x > y)
3415 where there are other uses of x and replace it with
3416 _7 = SUB_OVERFLOW (y, z);
3417 x = REALPART_EXPR <_7>;
3418 _8 = IMAGPART_EXPR <_7>;
3419 if (_8)
3420 and similarly for addition. */
3421
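/* The analogous addition case, shown as an illustrative sketch:

     x = y + z;                 _7 = .ADD_OVERFLOW (y, z);
     if (x < y)        -->      x = REALPART_EXPR <_7>;
       ...                      _8 = IMAGPART_EXPR <_7>;
                                if (_8 != 0)
                                  ...  */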
3422 static bool
3423 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3424 enum tree_code code)
3425 {
3426 tree lhs = gimple_assign_lhs (stmt);
3427 tree type = TREE_TYPE (lhs);
3428 use_operand_p use_p;
3429 imm_use_iterator iter;
3430 bool use_seen = false;
3431 bool ovf_use_seen = false;
3432 gimple *use_stmt;
3433
3434 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3435 if (!INTEGRAL_TYPE_P (type)
3436 || !TYPE_UNSIGNED (type)
3437 || has_zero_uses (lhs)
3438 || has_single_use (lhs)
3439 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3440 TYPE_MODE (type)) == CODE_FOR_nothing)
3441 return false;
3442
3443 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3444 {
3445 use_stmt = USE_STMT (use_p);
3446 if (is_gimple_debug (use_stmt))
3447 continue;
3448
3449 if (uaddsub_overflow_check_p (stmt, use_stmt))
3450 ovf_use_seen = true;
3451 else
3452 use_seen = true;
3453 if (ovf_use_seen && use_seen)
3454 break;
3455 }
3456
3457 if (!ovf_use_seen || !use_seen)
3458 return false;
3459
3460 tree ctype = build_complex_type (type);
3461 tree rhs1 = gimple_assign_rhs1 (stmt);
3462 tree rhs2 = gimple_assign_rhs2 (stmt);
3463 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3464 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3465 2, rhs1, rhs2);
3466 tree ctmp = make_ssa_name (ctype);
3467 gimple_call_set_lhs (g, ctmp);
3468 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3469 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3470 build1 (REALPART_EXPR, type, ctmp));
3471 gsi_replace (gsi, g2, true);
3472 tree ovf = make_ssa_name (type);
3473 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3474 build1 (IMAGPART_EXPR, type, ctmp));
3475 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3476
3477 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3478 {
3479 if (is_gimple_debug (use_stmt))
3480 continue;
3481
3482 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3483 if (ovf_use == 0)
3484 continue;
3485 if (gimple_code (use_stmt) == GIMPLE_COND)
3486 {
3487 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3488 gimple_cond_set_lhs (cond_stmt, ovf);
3489 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3490 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3491 }
3492 else
3493 {
3494 gcc_checking_assert (is_gimple_assign (use_stmt));
3495 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3496 {
3497 gimple_assign_set_rhs1 (use_stmt, ovf);
3498 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3499 gimple_assign_set_rhs_code (use_stmt,
3500 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3501 }
3502 else
3503 {
3504 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3505 == COND_EXPR);
3506 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3507 boolean_type_node, ovf,
3508 build_int_cst (type, 0));
3509 gimple_assign_set_rhs1 (use_stmt, cond);
3510 }
3511 }
3512 update_stmt (use_stmt);
3513 }
3514 return true;
3515 }
3516
3517 /* Return true if target has support for divmod. */
3518
3519 static bool
3520 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3521 {
3522 /* If target supports hardware divmod insn, use it for divmod. */
3523 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3524 return true;
3525
3526 /* Check if libfunc for divmod is available. */
3527 rtx libfunc = optab_libfunc (divmod_optab, mode);
3528 if (libfunc != NULL_RTX)
3529 {
3530 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3531 we don't want to use the libfunc even if it exists for the given mode. */
3532 machine_mode div_mode;
3533 FOR_EACH_MODE_FROM (div_mode, mode)
3534 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3535 return false;
3536
3537 return targetm.expand_divmod_libfunc != NULL;
3538 }
3539
3540 return false;
3541 }
3542
3543 /* Check if stmt is candidate for divmod transform. */
3544
3545 static bool
3546 divmod_candidate_p (gassign *stmt)
3547 {
3548 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3549 machine_mode mode = TYPE_MODE (type);
3550 optab divmod_optab, div_optab;
3551
3552 if (TYPE_UNSIGNED (type))
3553 {
3554 divmod_optab = udivmod_optab;
3555 div_optab = udiv_optab;
3556 }
3557 else
3558 {
3559 divmod_optab = sdivmod_optab;
3560 div_optab = sdiv_optab;
3561 }
3562
3563 tree op1 = gimple_assign_rhs1 (stmt);
3564 tree op2 = gimple_assign_rhs2 (stmt);
3565
3566 /* Disable the transform if either is a constant, since division-by-constant
3567 may have specialized expansion. */
3568 if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3569 return false;
3570
3571 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3572 expand using the [su]divv optabs. */
3573 if (TYPE_OVERFLOW_TRAPS (type))
3574 return false;
3575
3576 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3577 return false;
3578
3579 return true;
3580 }
3581
3582 /* This function looks for:
3583 t1 = a TRUNC_DIV_EXPR b;
3584 t2 = a TRUNC_MOD_EXPR b;
3585 and transforms it to the following sequence:
3586 complex_tmp = DIVMOD (a, b);
3587 t1 = REALPART_EXPR (complex_tmp);
3588 t2 = IMAGPART_EXPR (complex_tmp);
3589 For the conditions enabling the transform see divmod_candidate_p ().
3590
3591 The transform has three parts:
3592 1) Find top_stmt, the trunc_div or trunc_mod stmt that dominates all
3593 other trunc_div_expr and trunc_mod_expr stmts with the same operands.
3594 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
3595 to the stmts vector.
3596 3) Insert the DIVMOD call just before top_stmt and update the entries in
3597 the stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
3598 IMAGPART_EXPR for mod). */
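/* As an illustrative sketch (not lifted from the GCC testsuite), for a
   source fragment such as

     void f (unsigned a, unsigned b, unsigned *q, unsigned *r)
     {
       *q = a / b;   // TRUNC_DIV_EXPR
       *r = a % b;   // TRUNC_MOD_EXPR
     }

   both statements end up reading from a single DIVMOD internal call, which
   is later expanded to one hardware divmod insn or one divmod libcall
   instead of two separate divisions.  */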
3599
3600 static bool
3601 convert_to_divmod (gassign *stmt)
3602 {
3603 if (stmt_can_throw_internal (cfun, stmt)
3604 || !divmod_candidate_p (stmt))
3605 return false;
3606
3607 tree op1 = gimple_assign_rhs1 (stmt);
3608 tree op2 = gimple_assign_rhs2 (stmt);
3609
3610 imm_use_iterator use_iter;
3611 gimple *use_stmt;
3612 auto_vec<gimple *> stmts;
3613
3614 gimple *top_stmt = stmt;
3615 basic_block top_bb = gimple_bb (stmt);
3616
3617 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
3618 at least stmt and possibly other trunc_div/trunc_mod stmts
3619 having the same operands as stmt. */
3620
3621 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
3622 {
3623 if (is_gimple_assign (use_stmt)
3624 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3625 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3626 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
3627 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
3628 {
3629 if (stmt_can_throw_internal (cfun, use_stmt))
3630 continue;
3631
3632 basic_block bb = gimple_bb (use_stmt);
3633
3634 if (bb == top_bb)
3635 {
3636 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
3637 top_stmt = use_stmt;
3638 }
3639 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
3640 {
3641 top_bb = bb;
3642 top_stmt = use_stmt;
3643 }
3644 }
3645 }
3646
3647 tree top_op1 = gimple_assign_rhs1 (top_stmt);
3648 tree top_op2 = gimple_assign_rhs2 (top_stmt);
3649
3650 stmts.safe_push (top_stmt);
3651 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
3652
3653 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
3654 to the stmts vector. The second loop will always add stmt to the stmts
3655 vector, since gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
3656 second loop ends up adding at least a single trunc_mod_expr stmt. */
3657
3658 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
3659 {
3660 if (is_gimple_assign (use_stmt)
3661 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3662 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3663 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
3664 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
3665 {
3666 if (use_stmt == top_stmt
3667 || stmt_can_throw_internal (cfun, use_stmt)
3668 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
3669 continue;
3670
3671 stmts.safe_push (use_stmt);
3672 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
3673 div_seen = true;
3674 }
3675 }
3676
3677 if (!div_seen)
3678 return false;
3679
3680 /* Part 3: Create a call to the internal fn DIVMOD:
3681 divmod_tmp = DIVMOD (op1, op2). */
3682
3683 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
3684 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
3685 call_stmt, "divmod_tmp");
3686 gimple_call_set_lhs (call_stmt, res);
3687 /* We rejected throwing statements above. */
3688 gimple_call_set_nothrow (call_stmt, true);
3689
3690 /* Insert the call before top_stmt. */
3691 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
3692 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
3693
3694 widen_mul_stats.divmod_calls_inserted++;
3695
3696 /* Update all statements in stmts vector:
3697 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
3698 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
3699
3700 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
3701 {
3702 tree new_rhs;
3703
3704 switch (gimple_assign_rhs_code (use_stmt))
3705 {
3706 case TRUNC_DIV_EXPR:
3707 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
3708 break;
3709
3710 case TRUNC_MOD_EXPR:
3711 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
3712 break;
3713
3714 default:
3715 gcc_unreachable ();
3716 }
3717
3718 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3719 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
3720 update_stmt (use_stmt);
3721 }
3722
3723 return true;
3724 }
3725
3726 /* Find integer multiplications where the operands are extended from
3727 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3728 where appropriate. */
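/* For instance (an illustrative sketch), with 16-bit shorts and 32-bit ints,

     short a, b;
     int c = (int) a * (int) b;

   multiplies the sign-extended 32-bit values; when the target provides a
   widening multiply pattern, the extensions and the MULT_EXPR are combined
   into a single WIDEN_MULT_EXPR of the two narrower operands.  */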
3729
3730 namespace {
3731
3732 const pass_data pass_data_optimize_widening_mul =
3733 {
3734 GIMPLE_PASS, /* type */
3735 "widening_mul", /* name */
3736 OPTGROUP_NONE, /* optinfo_flags */
3737 TV_TREE_WIDEN_MUL, /* tv_id */
3738 PROP_ssa, /* properties_required */
3739 0, /* properties_provided */
3740 0, /* properties_destroyed */
3741 0, /* todo_flags_start */
3742 TODO_update_ssa, /* todo_flags_finish */
3743 };
3744
3745 class pass_optimize_widening_mul : public gimple_opt_pass
3746 {
3747 public:
3748 pass_optimize_widening_mul (gcc::context *ctxt)
3749 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3750 {}
3751
3752 /* opt_pass methods: */
3753 virtual bool gate (function *)
3754 {
3755 return flag_expensive_optimizations && optimize;
3756 }
3757
3758 virtual unsigned int execute (function *);
3759
3760 }; // class pass_optimize_widening_mul
3761
3762 /* Walker class to perform the transformation in reverse dominance order. */
3763
3764 class math_opts_dom_walker : public dom_walker
3765 {
3766 public:
3767 /* Constructor.  CFG_CHANGED_P is a pointer to a boolean flag that will
3768 be set if walking modifies the CFG. */
3769
3770 math_opts_dom_walker (bool *cfg_changed_p)
3771 : dom_walker (CDI_DOMINATORS), m_last_result_set (),
3772 m_cfg_changed_p (cfg_changed_p) {}
3773
3774 /* The actual actions performed in the walk. */
3775
3776 virtual void after_dom_children (basic_block);
3777
3778 /* Set of results of multiply-and-add statement chains that were not
3779 transformed into FMAs because deferring was active. */
3780 hash_set<tree> m_last_result_set;
3781
3782 /* Pointer to a caller-provided flag that must be set if the CFG has
3783 been modified. */
3784 bool *m_cfg_changed_p;
3785 };
3786
3787 void
3788 math_opts_dom_walker::after_dom_children (basic_block bb)
3789 {
3790 gimple_stmt_iterator gsi;
3791
3792 fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);
3793
3794 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3795 {
3796 gimple *stmt = gsi_stmt (gsi);
3797 enum tree_code code;
3798
3799 if (is_gimple_assign (stmt))
3800 {
3801 code = gimple_assign_rhs_code (stmt);
3802 switch (code)
3803 {
3804 case MULT_EXPR:
3805 if (!convert_mult_to_widen (stmt, &gsi)
3806 && !convert_expand_mult_copysign (stmt, &gsi)
3807 && convert_mult_to_fma (stmt,
3808 gimple_assign_rhs1 (stmt),
3809 gimple_assign_rhs2 (stmt),
3810 &fma_state))
3811 {
3812 gsi_remove (&gsi, true);
3813 release_defs (stmt);
3814 continue;
3815 }
3816 break;
3817
3818 case PLUS_EXPR:
3819 case MINUS_EXPR:
3820 if (!convert_plusminus_to_widen (&gsi, stmt, code))
3821 match_uaddsub_overflow (&gsi, stmt, code);
3822 break;
3823
3824 case TRUNC_MOD_EXPR:
3825 convert_to_divmod (as_a<gassign *> (stmt));
3826 break;
3827
3828 default:;
3829 }
3830 }
3831 else if (is_gimple_call (stmt))
3832 {
3833 switch (gimple_call_combined_fn (stmt))
3834 {
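/* pow (x, 2.0) computes x * x, so it can feed FMA formation just like an
   ordinary MULT_EXPR.  */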
3835 CASE_CFN_POW:
3836 if (gimple_call_lhs (stmt)
3837 && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3838 && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3839 &dconst2)
3840 && convert_mult_to_fma (stmt,
3841 gimple_call_arg (stmt, 0),
3842 gimple_call_arg (stmt, 0),
3843 &fma_state))
3844 {
3845 unlink_stmt_vdef (stmt);
3846 if (gsi_remove (&gsi, true)
3847 && gimple_purge_dead_eh_edges (bb))
3848 *m_cfg_changed_p = true;
3849 release_defs (stmt);
3850 continue;
3851 }
3852 break;
3853
3854 case CFN_COND_MUL:
3855 if (convert_mult_to_fma (stmt,
3856 gimple_call_arg (stmt, 1),
3857 gimple_call_arg (stmt, 2),
3858 &fma_state,
3859 gimple_call_arg (stmt, 0)))
3860
3861 {
3862 gsi_remove (&gsi, true);
3863 release_defs (stmt);
3864 continue;
3865 }
3866 break;
3867
3868 case CFN_LAST:
3869 cancel_fma_deferring (&fma_state);
3870 break;
3871
3872 default:
3873 break;
3874 }
3875 }
3876 gsi_next (&gsi);
3877 }
3878 if (fma_state.m_deferring_p
3879 && fma_state.m_initial_phi)
3880 {
3881 gcc_checking_assert (fma_state.m_last_result);
3882 if (!last_fma_candidate_feeds_initial_phi (&fma_state,
3883 &m_last_result_set))
3884 cancel_fma_deferring (&fma_state);
3885 else
3886 m_last_result_set.add (fma_state.m_last_result);
3887 }
3888 }
3889
3890
3891 unsigned int
3892 pass_optimize_widening_mul::execute (function *fun)
3893 {
3894 bool cfg_changed = false;
3895
3896 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3897 calculate_dominance_info (CDI_DOMINATORS);
3898 renumber_gimple_stmt_uids (cfun);
3899
3900 math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3901
3902 statistics_counter_event (fun, "widening multiplications inserted",
3903 widen_mul_stats.widen_mults_inserted);
3904 statistics_counter_event (fun, "widening maccs inserted",
3905 widen_mul_stats.maccs_inserted);
3906 statistics_counter_event (fun, "fused multiply-adds inserted",
3907 widen_mul_stats.fmas_inserted);
3908 statistics_counter_event (fun, "divmod calls inserted",
3909 widen_mul_stats.divmod_calls_inserted);
3910
3911 return cfg_changed ? TODO_cleanup_cfg : 0;
3912 }
3913
3914 } // anon namespace
3915
3916 gimple_opt_pass *
3917 make_pass_optimize_widening_mul (gcc::context *ctxt)
3918 {
3919 return new pass_optimize_widening_mul (ctxt);
3920 }