gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2020 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
43      First of all, experiments have shown that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59      that has fewer than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63      and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
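/* As a rough illustration (assuming flag_reciprocal_math and a target
   threshold of two divisions, the usual default of
   min_divisions_for_recip_mul), a source sequence such as

	x = a / d;
	y = b / d;

   is rewritten by this pass as if it read

	t = 1.0 / d;
	x = a * t;
	y = b * t;

   while a single division by D is left alone.  */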
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "internal-fn.h"
113 #include "case-cfn-macros.h"
114 #include "optabs-libfuncs.h"
115 #include "tree-eh.h"
116 #include "targhooks.h"
117 #include "domwalk.h"
118
119 /* This structure represents one basic block that either computes a
120    division, or is a common dominator for basic blocks that compute a
121 division. */
122 struct occurrence {
123 /* The basic block represented by this structure. */
124 basic_block bb = basic_block();
125
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */
128 tree recip_def = tree();
129
130 /* If non-NULL, the SSA_NAME holding the definition for a squared
131 reciprocal inserted in BB. */
132 tree square_recip_def = tree();
133
134 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
135 was inserted in BB. */
136 gimple *recip_def_stmt = nullptr;
137
138 /* Pointer to a list of "struct occurrence"s for blocks dominated
139 by BB. */
140 struct occurrence *children = nullptr;
141
142  /* Pointer to the next "struct occurrence" in the list of blocks
143 sharing a common dominator. */
144 struct occurrence *next = nullptr;
145
146 /* The number of divisions that are in BB before compute_merit. The
147 number of divisions that are in BB or post-dominate it after
148 compute_merit. */
149 int num_divisions = 0;
150
151 /* True if the basic block has a division, false if it is a common
152 dominator for basic blocks that do. If it is false and trapping
153 math is active, BB is not a candidate for inserting a reciprocal. */
154 bool bb_has_division = false;
155
156 /* Construct a struct occurrence for basic block BB, and whose
157 children list is headed by CHILDREN. */
158 occurrence (basic_block bb, struct occurrence *children)
159 : bb (bb), children (children)
160 {
161 bb->aux = this;
162 }
163
164 /* Destroy a struct occurrence and remove it from its basic block. */
165 ~occurrence ()
166 {
167 bb->aux = nullptr;
168 }
169
170 /* Allocate memory for a struct occurrence from OCC_POOL. */
171 static void* operator new (size_t);
172
173 /* Return memory for a struct occurrence to OCC_POOL. */
174 static void operator delete (void*, size_t);
175 };
176
177 static struct
178 {
179 /* Number of 1.0/X ops inserted. */
180 int rdivs_inserted;
181
182 /* Number of 1.0/FUNC ops inserted. */
183 int rfuncs_inserted;
184 } reciprocal_stats;
185
186 static struct
187 {
188 /* Number of cexpi calls inserted. */
189 int inserted;
190 } sincos_stats;
191
192 static struct
193 {
194 /* Number of widening multiplication ops inserted. */
195 int widen_mults_inserted;
196
197 /* Number of integer multiply-and-accumulate ops inserted. */
198 int maccs_inserted;
199
200 /* Number of fp fused multiply-add ops inserted. */
201 int fmas_inserted;
202
203 /* Number of divmod calls inserted. */
204 int divmod_calls_inserted;
205 } widen_mul_stats;
206
207 /* The instance of "struct occurrence" representing the highest
208 interesting block in the dominator tree. */
209 static struct occurrence *occ_head;
210
211 /* Allocation pool for getting instances of "struct occurrence". */
212 static object_allocator<occurrence> *occ_pool;
213
214 void* occurrence::operator new (size_t n)
215 {
216 gcc_assert (n == sizeof(occurrence));
217 return occ_pool->allocate_raw ();
218 }
219
220 void occurrence::operator delete (void *occ, size_t n)
221 {
222 gcc_assert (n == sizeof(occurrence));
223 occ_pool->remove_raw (occ);
224 }
225
226 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
227 list of "struct occurrence"s, one per basic block, having IDOM as
228 their common dominator.
229
230 We try to insert NEW_OCC as deep as possible in the tree, and we also
231 insert any other block that is a common dominator for BB and one
232 block already in the tree. */
233
234 static void
235 insert_bb (struct occurrence *new_occ, basic_block idom,
236 struct occurrence **p_head)
237 {
238 struct occurrence *occ, **p_occ;
239
240 for (p_occ = p_head; (occ = *p_occ) != NULL; )
241 {
242 basic_block bb = new_occ->bb, occ_bb = occ->bb;
243 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
244 if (dom == bb)
245 {
246 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
247 from its list. */
248 *p_occ = occ->next;
249 occ->next = new_occ->children;
250 new_occ->children = occ;
251
252	  /* Try the next block (it may also be dominated by BB).  */
253 }
254
255 else if (dom == occ_bb)
256 {
257 /* OCC_BB dominates BB. Tail recurse to look deeper. */
258 insert_bb (new_occ, dom, &occ->children);
259 return;
260 }
261
262 else if (dom != idom)
263 {
264 gcc_assert (!dom->aux);
265
266 /* There is a dominator between IDOM and BB, add it and make
267 two children out of NEW_OCC and OCC. First, remove OCC from
268 its list. */
269 *p_occ = occ->next;
270 new_occ->next = occ;
271 occ->next = NULL;
272
273 /* None of the previous blocks has DOM as a dominator: if we tail
274 recursed, we would reexamine them uselessly. Just switch BB with
275 DOM, and go on looking for blocks dominated by DOM. */
276 new_occ = new occurrence (dom, new_occ);
277 }
278
279 else
280 {
281 /* Nothing special, go on with the next element. */
282 p_occ = &occ->next;
283 }
284 }
285
286 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
287 new_occ->next = *p_head;
288 *p_head = new_occ;
289 }
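/* A small illustration of the cases above, using hypothetical blocks
   BB2, BB4 and BB5: suppose the list under the entry block already
   holds an occurrence for BB5 and we insert a new occurrence for BB4,
   where the nearest common dominator of BB4 and BB5 is BB2.  The third
   case applies: BB5 is unlinked from the list, a fresh occurrence is
   created for BB2 with BB4 and BB5 as its children, and BB2 itself is
   finally linked into the list under the entry block.  */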
290
291 /* Register that we found a division in BB.
292 IMPORTANCE is a measure of how much weighting to give
293 that division. Use IMPORTANCE = 2 to register a single
294 division. If the division is going to be found multiple
295 times use 1 (as it is with squares). */
296
297 static inline void
298 register_division_in (basic_block bb, int importance)
299 {
300 struct occurrence *occ;
301
302 occ = (struct occurrence *) bb->aux;
303 if (!occ)
304 {
305 occ = new occurrence (bb, NULL);
306 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
307 }
308
309 occ->bb_has_division = true;
310 occ->num_divisions += importance;
311 }
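/* For illustration: a plain division x = a / d is registered once with
   IMPORTANCE == 2, while a division by a square, y = b / (d * d), is
   reached twice by the use walk in execute_cse_reciprocals_1 and so is
   registered twice with IMPORTANCE == 1.  Either way each division
   contributes 2 to num_divisions, which is why insert_reciprocals
   halves the count before comparing it against the threshold.  */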
312
313
314 /* Compute the number of divisions that postdominate each block in OCC and
315 its children. */
316
317 static void
318 compute_merit (struct occurrence *occ)
319 {
320 struct occurrence *occ_child;
321 basic_block dom = occ->bb;
322
323 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
324 {
325 basic_block bb;
326 if (occ_child->children)
327 compute_merit (occ_child);
328
329 if (flag_exceptions)
330 bb = single_noncomplex_succ (dom);
331 else
332 bb = dom;
333
334 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
335 occ->num_divisions += occ_child->num_divisions;
336 }
337 }
338
339
340 /* Return whether USE_STMT is a floating-point division by DEF. */
341 static inline bool
342 is_division_by (gimple *use_stmt, tree def)
343 {
344 return is_gimple_assign (use_stmt)
345 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
346 && gimple_assign_rhs2 (use_stmt) == def
347 /* Do not recognize x / x as valid division, as we are getting
348 confused later by replacing all immediate uses x in such
349 a stmt. */
350 && gimple_assign_rhs1 (use_stmt) != def
351 && !stmt_can_throw_internal (cfun, use_stmt);
352 }
353
354 /* Return TRUE if USE_STMT is a multiplication of DEF by A. */
355 static inline bool
356 is_mult_by (gimple *use_stmt, tree def, tree a)
357 {
358 if (gimple_code (use_stmt) == GIMPLE_ASSIGN
359 && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
360 {
361 tree op0 = gimple_assign_rhs1 (use_stmt);
362 tree op1 = gimple_assign_rhs2 (use_stmt);
363
364 return (op0 == def && op1 == a)
365 || (op0 == a && op1 == def);
366 }
367   return false;
368 }
369
370 /* Return whether USE_STMT is DEF * DEF. */
371 static inline bool
372 is_square_of (gimple *use_stmt, tree def)
373 {
374 return is_mult_by (use_stmt, def, def);
375 }
376
377 /* Return whether USE_STMT is a floating-point division by
378 DEF * DEF. */
379 static inline bool
380 is_division_by_square (gimple *use_stmt, tree def)
381 {
382 if (gimple_code (use_stmt) == GIMPLE_ASSIGN
383 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
384 && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
385 && !stmt_can_throw_internal (cfun, use_stmt))
386 {
387 tree denominator = gimple_assign_rhs2 (use_stmt);
388 if (TREE_CODE (denominator) == SSA_NAME)
389 return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
390 }
391   return false;
392 }
393
394 /* Walk the subset of the dominator tree rooted at OCC, setting the
395 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
396 the given basic block. The field may be left NULL, of course,
397 if it is not possible or profitable to do the optimization.
398
399 DEF_BSI is an iterator pointing at the statement defining DEF.
400 If RECIP_DEF is set, a dominator already has a computation that can
401 be used.
402
403 If should_insert_square_recip is set, then this also inserts
404 the square of the reciprocal immediately after the definition
405 of the reciprocal. */
406
407 static void
408 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
409 tree def, tree recip_def, tree square_recip_def,
410 int should_insert_square_recip, int threshold)
411 {
412 tree type;
413 gassign *new_stmt, *new_square_stmt;
414 gimple_stmt_iterator gsi;
415 struct occurrence *occ_child;
416
417 if (!recip_def
418 && (occ->bb_has_division || !flag_trapping_math)
419 /* Divide by two as all divisions are counted twice in
420 the costing loop. */
421 && occ->num_divisions / 2 >= threshold)
422 {
423 /* Make a variable with the replacement and substitute it. */
424 type = TREE_TYPE (def);
425 recip_def = create_tmp_reg (type, "reciptmp");
426 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
427 build_one_cst (type), def);
428
429 if (should_insert_square_recip)
430 {
431 square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
432 new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
433 recip_def, recip_def);
434 }
435
436 if (occ->bb_has_division)
437 {
438 /* Case 1: insert before an existing division. */
439 gsi = gsi_after_labels (occ->bb);
440 while (!gsi_end_p (gsi)
441 && (!is_division_by (gsi_stmt (gsi), def))
442 && (!is_division_by_square (gsi_stmt (gsi), def)))
443 gsi_next (&gsi);
444
445 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
446 if (should_insert_square_recip)
447 gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
448 }
449 else if (def_gsi && occ->bb == def_gsi->bb)
450 {
451 /* Case 2: insert right after the definition. Note that this will
452 never happen if the definition statement can throw, because in
453 that case the sole successor of the statement's basic block will
454 dominate all the uses as well. */
455 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
456 if (should_insert_square_recip)
457 gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
458 }
459 else
460 {
461 /* Case 3: insert in a basic block not containing defs/uses. */
462 gsi = gsi_after_labels (occ->bb);
463 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
464 if (should_insert_square_recip)
465 gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
466 }
467
468 reciprocal_stats.rdivs_inserted++;
469
470 occ->recip_def_stmt = new_stmt;
471 }
472
473 occ->recip_def = recip_def;
474 occ->square_recip_def = square_recip_def;
475 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
476 insert_reciprocals (def_gsi, occ_child, def, recip_def,
477 square_recip_def, should_insert_square_recip,
478 threshold);
479 }
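/* A schematic sketch (made-up SSA names) of the net effect of this
   routine together with the replace_* helpers below, when
   SHOULD_INSERT_SQUARE_RECIP is set: a block containing

	x_3 = a_1 / d_2;
	t_4 = d_2 * d_2;
	y_5 = b_6 / t_4;

   roughly becomes

	reciptmp_7 = 1.0 / d_2;
	powmult_reciptmp_8 = reciptmp_7 * reciptmp_7;
	x_3 = a_1 * reciptmp_7;
	t_4 = d_2 * d_2;
	y_5 = b_6 * powmult_reciptmp_8;

   leaving the now possibly dead square for later DCE.  */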
480
481 /* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
482 Take as argument the use for (x * x). */
483 static inline void
484 replace_reciprocal_squares (use_operand_p use_p)
485 {
486 gimple *use_stmt = USE_STMT (use_p);
487 basic_block bb = gimple_bb (use_stmt);
488 struct occurrence *occ = (struct occurrence *) bb->aux;
489
490 if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
491 && occ->recip_def)
492 {
493 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
494 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
495 gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
496 SET_USE (use_p, occ->square_recip_def);
497 fold_stmt_inplace (&gsi);
498 update_stmt (use_stmt);
499 }
500 }
501
502
503 /* Replace the division at USE_P with a multiplication by the reciprocal, if
504 possible. */
505
506 static inline void
507 replace_reciprocal (use_operand_p use_p)
508 {
509 gimple *use_stmt = USE_STMT (use_p);
510 basic_block bb = gimple_bb (use_stmt);
511 struct occurrence *occ = (struct occurrence *) bb->aux;
512
513 if (optimize_bb_for_speed_p (bb)
514 && occ->recip_def && use_stmt != occ->recip_def_stmt)
515 {
516 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
517 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
518 SET_USE (use_p, occ->recip_def);
519 fold_stmt_inplace (&gsi);
520 update_stmt (use_stmt);
521 }
522 }
523
524
525 /* Free OCC and return one more "struct occurrence" to be freed. */
526
527 static struct occurrence *
528 free_bb (struct occurrence *occ)
529 {
530 struct occurrence *child, *next;
531
532 /* First get the two pointers hanging off OCC. */
533 next = occ->next;
534 child = occ->children;
535 delete occ;
536
537 /* Now ensure that we don't recurse unless it is necessary. */
538 if (!child)
539 return next;
540 else
541 {
542 while (next)
543 next = free_bb (next);
544
545 return child;
546 }
547 }
548
549 /* Transform sequences like
550 t = sqrt (a)
551 x = 1.0 / t;
552 r1 = x * x;
553 r2 = a * x;
554 into:
555 t = sqrt (a)
556 r1 = 1.0 / a;
557 r2 = t;
558 x = r1 * r2;
559 depending on the uses of x, r1, r2. This removes one multiplication and
560 allows the sqrt and division operations to execute in parallel.
561 DEF_GSI is the gsi of the initial division by sqrt that defines
562 DEF (x in the example above). */
563
564 static void
565 optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
566 {
567 gimple *use_stmt;
568 imm_use_iterator use_iter;
569 gimple *stmt = gsi_stmt (*def_gsi);
570 tree x = def;
571 tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
572 tree div_rhs1 = gimple_assign_rhs1 (stmt);
573
574 if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
575 || TREE_CODE (div_rhs1) != REAL_CST
576 || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
577 return;
578
579 gcall *sqrt_stmt
580 = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
581
582 if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
583 return;
584
585 switch (gimple_call_combined_fn (sqrt_stmt))
586 {
587 CASE_CFN_SQRT:
588 CASE_CFN_SQRT_FN:
589 break;
590
591 default:
592 return;
593 }
594 tree a = gimple_call_arg (sqrt_stmt, 0);
595
596 /* We have 'a' and 'x'. Now analyze the uses of 'x'. */
597
598 /* Statements that use x in x * x. */
599 auto_vec<gimple *> sqr_stmts;
600 /* Statements that use x in a * x. */
601 auto_vec<gimple *> mult_stmts;
602 bool has_other_use = false;
603 bool mult_on_main_path = false;
604
605 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
606 {
607 if (is_gimple_debug (use_stmt))
608 continue;
609 if (is_square_of (use_stmt, x))
610 {
611 sqr_stmts.safe_push (use_stmt);
612 if (gimple_bb (use_stmt) == gimple_bb (stmt))
613 mult_on_main_path = true;
614 }
615 else if (is_mult_by (use_stmt, x, a))
616 {
617 mult_stmts.safe_push (use_stmt);
618 if (gimple_bb (use_stmt) == gimple_bb (stmt))
619 mult_on_main_path = true;
620 }
621 else
622 has_other_use = true;
623 }
624
625 /* In the x * x and a * x cases we just rewire stmt operands or
626 remove multiplications. In the has_other_use case we introduce
627 a multiplication so make sure we don't introduce a multiplication
628 on a path where there was none. */
629 if (has_other_use && !mult_on_main_path)
630 return;
631
632 if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
633 return;
634
635 /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
636 to be able to compose it from the sqr and mult cases. */
637 if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
638 return;
639
640 if (dump_file)
641 {
642 fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
643 print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
644 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
645 fprintf (dump_file, "\n");
646 }
647
648 bool delete_div = !has_other_use;
649 tree sqr_ssa_name = NULL_TREE;
650 if (!sqr_stmts.is_empty ())
651 {
652 /* r1 = x * x. Transform the original
653 x = 1.0 / t
654 into
655 tmp1 = 1.0 / a
656 r1 = tmp1. */
657
658 sqr_ssa_name
659 = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
660
661 if (dump_file)
662 {
663 fprintf (dump_file, "Replacing original division\n");
664 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
665 fprintf (dump_file, "with new division\n");
666 }
667 stmt
668 = gimple_build_assign (sqr_ssa_name, gimple_assign_rhs_code (stmt),
669 gimple_assign_rhs1 (stmt), a);
670 gsi_insert_before (def_gsi, stmt, GSI_SAME_STMT);
671 gsi_remove (def_gsi, true);
672 *def_gsi = gsi_for_stmt (stmt);
673 fold_stmt_inplace (def_gsi);
674 update_stmt (stmt);
675
676 if (dump_file)
677 print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
678
679 delete_div = false;
680 gimple *sqr_stmt;
681 unsigned int i;
682 FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
683 {
684 gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
685 gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
686 update_stmt (sqr_stmt);
687 }
688 }
689 if (!mult_stmts.is_empty ())
690 {
691 /* r2 = a * x. Transform this into:
692 r2 = t (The original sqrt (a)). */
693 unsigned int i;
694 gimple *mult_stmt = NULL;
695 FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
696 {
697 gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
698
699 if (dump_file)
700 {
701 fprintf (dump_file, "Replacing squaring multiplication\n");
702 print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
703 fprintf (dump_file, "with assignment\n");
704 }
705 gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
706 fold_stmt_inplace (&gsi2);
707 update_stmt (mult_stmt);
708 if (dump_file)
709 print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
710 }
711 }
712
713 if (has_other_use)
714 {
715 /* Using the two temporaries tmp1, tmp2 from above
716 the original x is now:
717 x = tmp1 * tmp2. */
718 gcc_assert (orig_sqrt_ssa_name);
719 gcc_assert (sqr_ssa_name);
720
721 gimple *new_stmt
722 = gimple_build_assign (x, MULT_EXPR,
723 orig_sqrt_ssa_name, sqr_ssa_name);
724 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
725 update_stmt (stmt);
726 }
727 else if (delete_div)
728 {
729 /* Remove the original division. */
730 gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
731 gsi_remove (&gsi2, true);
732 release_defs (stmt);
733 }
734 else
735 release_ssa_name (x);
736 }
737
738 /* Look for floating-point divisions among DEF's uses, and try to
739 replace them by multiplications with the reciprocal. Add
740 as many statements computing the reciprocal as needed.
741
742 DEF must be a GIMPLE register of a floating-point type. */
743
744 static void
745 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
746 {
747 use_operand_p use_p, square_use_p;
748 imm_use_iterator use_iter, square_use_iter;
749 tree square_def;
750 struct occurrence *occ;
751 int count = 0;
752 int threshold;
753 int square_recip_count = 0;
754 int sqrt_recip_count = 0;
755
756 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
757 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
758
759 /* If DEF is a square (x * x), count the number of divisions by x.
760 If there are more divisions by x than by (DEF * DEF), prefer to optimize
761 the reciprocal of x instead of DEF. This improves cases like:
762 def = x * x
763 t0 = a / def
764 t1 = b / def
765 t2 = c / x
766 Reciprocal optimization of x results in 1 division rather than 2 or 3. */
767 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
768
769 if (is_gimple_assign (def_stmt)
770 && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
771 && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
772 && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
773 {
774 tree op0 = gimple_assign_rhs1 (def_stmt);
775
776 FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
777 {
778 gimple *use_stmt = USE_STMT (use_p);
779 if (is_division_by (use_stmt, op0))
780 sqrt_recip_count++;
781 }
782 }
783
784 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
785 {
786 gimple *use_stmt = USE_STMT (use_p);
787 if (is_division_by (use_stmt, def))
788 {
789 register_division_in (gimple_bb (use_stmt), 2);
790 count++;
791 }
792
793 if (is_square_of (use_stmt, def))
794 {
795 square_def = gimple_assign_lhs (use_stmt);
796 FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
797 {
798 gimple *square_use_stmt = USE_STMT (square_use_p);
799 if (is_division_by (square_use_stmt, square_def))
800 {
801 /* This is executed twice for each division by a square. */
802 register_division_in (gimple_bb (square_use_stmt), 1);
803 square_recip_count++;
804 }
805 }
806 }
807 }
808
809 /* Square reciprocals were counted twice above. */
810 square_recip_count /= 2;
811
812 /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x). */
813 if (sqrt_recip_count > square_recip_count)
814 goto out;
815
816 /* Do the expensive part only if we can hope to optimize something. */
817 if (count + square_recip_count >= threshold && count >= 1)
818 {
819 gimple *use_stmt;
820 for (occ = occ_head; occ; occ = occ->next)
821 {
822 compute_merit (occ);
823 insert_reciprocals (def_gsi, occ, def, NULL, NULL,
824 square_recip_count, threshold);
825 }
826
827 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
828 {
829 if (is_division_by (use_stmt, def))
830 {
831 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
832 replace_reciprocal (use_p);
833 }
834 else if (square_recip_count > 0 && is_square_of (use_stmt, def))
835 {
836 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
837 {
838 /* Find all uses of the square that are divisions and
839		     replace them by multiplications with the inverse.  */
840 imm_use_iterator square_iterator;
841 gimple *powmult_use_stmt = USE_STMT (use_p);
842 tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);
843
844 FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
845 square_iterator, powmult_def_name)
846 FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
847 {
848 gimple *powmult_use_stmt = USE_STMT (square_use_p);
849 if (is_division_by (powmult_use_stmt, powmult_def_name))
850 replace_reciprocal_squares (square_use_p);
851 }
852 }
853 }
854 }
855 }
856
857 out:
858 for (occ = occ_head; occ; )
859 occ = free_bb (occ);
860
861 occ_head = NULL;
862 }
863
864 /* Return an internal function that implements the reciprocal of CALL,
865 or IFN_LAST if there is no such function that the target supports. */
866
867 internal_fn
868 internal_fn_reciprocal (gcall *call)
869 {
870 internal_fn ifn;
871
872 switch (gimple_call_combined_fn (call))
873 {
874 CASE_CFN_SQRT:
875 CASE_CFN_SQRT_FN:
876 ifn = IFN_RSQRT;
877 break;
878
879 default:
880 return IFN_LAST;
881 }
882
883 tree_pair types = direct_internal_fn_types (ifn, call);
884 if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
885 return IFN_LAST;
886
887 return ifn;
888 }
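/* As a rough sketch (assuming the target supports IFN_RSQRT for the
   mode): when every use of a sqrt result is a division, the caller
   below turns

	t_1 = sqrtf (b_2);
	x_3 = a_4 / t_1;

   into, roughly,

	t_1 = .RSQRT (b_2);
	x_3 = a_4 * t_1;  */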
889
890 /* Go through all the floating-point SSA_NAMEs, and call
891 execute_cse_reciprocals_1 on each of them. */
892 namespace {
893
894 const pass_data pass_data_cse_reciprocals =
895 {
896 GIMPLE_PASS, /* type */
897 "recip", /* name */
898 OPTGROUP_NONE, /* optinfo_flags */
899 TV_TREE_RECIP, /* tv_id */
900 PROP_ssa, /* properties_required */
901 0, /* properties_provided */
902 0, /* properties_destroyed */
903 0, /* todo_flags_start */
904 TODO_update_ssa, /* todo_flags_finish */
905 };
906
907 class pass_cse_reciprocals : public gimple_opt_pass
908 {
909 public:
910 pass_cse_reciprocals (gcc::context *ctxt)
911 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
912 {}
913
914 /* opt_pass methods: */
915 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
916 virtual unsigned int execute (function *);
917
918 }; // class pass_cse_reciprocals
919
920 unsigned int
921 pass_cse_reciprocals::execute (function *fun)
922 {
923 basic_block bb;
924 tree arg;
925
926 occ_pool = new object_allocator<occurrence> ("dominators for recip");
927
928 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
929 calculate_dominance_info (CDI_DOMINATORS);
930 calculate_dominance_info (CDI_POST_DOMINATORS);
931
932 if (flag_checking)
933 FOR_EACH_BB_FN (bb, fun)
934 gcc_assert (!bb->aux);
935
936 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
937 if (FLOAT_TYPE_P (TREE_TYPE (arg))
938 && is_gimple_reg (arg))
939 {
940 tree name = ssa_default_def (fun, arg);
941 if (name)
942 execute_cse_reciprocals_1 (NULL, name);
943 }
944
945 FOR_EACH_BB_FN (bb, fun)
946 {
947 tree def;
948
949 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
950 gsi_next (&gsi))
951 {
952 gphi *phi = gsi.phi ();
953 def = PHI_RESULT (phi);
954 if (! virtual_operand_p (def)
955 && FLOAT_TYPE_P (TREE_TYPE (def)))
956 execute_cse_reciprocals_1 (NULL, def);
957 }
958
959 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
960 gsi_next (&gsi))
961 {
962 gimple *stmt = gsi_stmt (gsi);
963
964 if (gimple_has_lhs (stmt)
965 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
966 && FLOAT_TYPE_P (TREE_TYPE (def))
967 && TREE_CODE (def) == SSA_NAME)
968 {
969 execute_cse_reciprocals_1 (&gsi, def);
970 stmt = gsi_stmt (gsi);
971 if (flag_unsafe_math_optimizations
972 && is_gimple_assign (stmt)
973 && gimple_assign_lhs (stmt) == def
974 && !stmt_can_throw_internal (cfun, stmt)
975 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
976 optimize_recip_sqrt (&gsi, def);
977 }
978 }
979
980 if (optimize_bb_for_size_p (bb))
981 continue;
982
983 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
984 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
985 gsi_next (&gsi))
986 {
987 gimple *stmt = gsi_stmt (gsi);
988
989 if (is_gimple_assign (stmt)
990 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
991 {
992 tree arg1 = gimple_assign_rhs2 (stmt);
993 gimple *stmt1;
994
995 if (TREE_CODE (arg1) != SSA_NAME)
996 continue;
997
998 stmt1 = SSA_NAME_DEF_STMT (arg1);
999
1000 if (is_gimple_call (stmt1)
1001 && gimple_call_lhs (stmt1))
1002 {
1003 bool fail;
1004 imm_use_iterator ui;
1005 use_operand_p use_p;
1006 tree fndecl = NULL_TREE;
1007
1008 gcall *call = as_a <gcall *> (stmt1);
1009 internal_fn ifn = internal_fn_reciprocal (call);
1010 if (ifn == IFN_LAST)
1011 {
1012 fndecl = gimple_call_fndecl (call);
1013 if (!fndecl
1014 || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
1015 continue;
1016 fndecl = targetm.builtin_reciprocal (fndecl);
1017 if (!fndecl)
1018 continue;
1019 }
1020
1021 /* Check that all uses of the SSA name are divisions,
1022 otherwise replacing the defining statement will do
1023 the wrong thing. */
1024 fail = false;
1025 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
1026 {
1027 gimple *stmt2 = USE_STMT (use_p);
1028 if (is_gimple_debug (stmt2))
1029 continue;
1030 if (!is_gimple_assign (stmt2)
1031 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
1032 || gimple_assign_rhs1 (stmt2) == arg1
1033 || gimple_assign_rhs2 (stmt2) != arg1)
1034 {
1035 fail = true;
1036 break;
1037 }
1038 }
1039 if (fail)
1040 continue;
1041
1042 gimple_replace_ssa_lhs (call, arg1);
1043 if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
1044 {
1045 auto_vec<tree, 4> args;
1046 for (unsigned int i = 0;
1047 i < gimple_call_num_args (call); i++)
1048 args.safe_push (gimple_call_arg (call, i));
1049 gcall *stmt2;
1050 if (ifn == IFN_LAST)
1051 stmt2 = gimple_build_call_vec (fndecl, args);
1052 else
1053 stmt2 = gimple_build_call_internal_vec (ifn, args);
1054 gimple_call_set_lhs (stmt2, arg1);
1055 gimple_move_vops (stmt2, call);
1056 gimple_call_set_nothrow (stmt2,
1057 gimple_call_nothrow_p (call));
1058 gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
1059 gsi_replace (&gsi2, stmt2, true);
1060 }
1061 else
1062 {
1063 if (ifn == IFN_LAST)
1064 gimple_call_set_fndecl (call, fndecl);
1065 else
1066 gimple_call_set_internal_fn (call, ifn);
1067 update_stmt (call);
1068 }
1069 reciprocal_stats.rfuncs_inserted++;
1070
1071 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
1072 {
1073 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
1074 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
1075 fold_stmt_inplace (&gsi);
1076 update_stmt (stmt);
1077 }
1078 }
1079 }
1080 }
1081 }
1082
1083 statistics_counter_event (fun, "reciprocal divs inserted",
1084 reciprocal_stats.rdivs_inserted);
1085 statistics_counter_event (fun, "reciprocal functions inserted",
1086 reciprocal_stats.rfuncs_inserted);
1087
1088 free_dominance_info (CDI_DOMINATORS);
1089 free_dominance_info (CDI_POST_DOMINATORS);
1090 delete occ_pool;
1091 return 0;
1092 }
1093
1094 } // anon namespace
1095
1096 gimple_opt_pass *
1097 make_pass_cse_reciprocals (gcc::context *ctxt)
1098 {
1099 return new pass_cse_reciprocals (ctxt);
1100 }
1101
1102 /* Records an occurrence at statement USE_STMT in the vector of statements
1103    STMTS if it is dominated by *TOP_BB, dominates it, or *TOP_BB is not
1104    yet initialized.  Returns true if the occurrence was pushed on
1105 the vector. Adjusts *TOP_BB to be the basic block dominating all
1106 statements in the vector. */
1107
1108 static bool
1109 maybe_record_sincos (vec<gimple *> *stmts,
1110 basic_block *top_bb, gimple *use_stmt)
1111 {
1112 basic_block use_bb = gimple_bb (use_stmt);
1113 if (*top_bb
1114 && (*top_bb == use_bb
1115 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
1116 stmts->safe_push (use_stmt);
1117 else if (!*top_bb
1118 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
1119 {
1120 stmts->safe_push (use_stmt);
1121 *top_bb = use_bb;
1122 }
1123 else
1124 return false;
1125
1126 return true;
1127 }
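/* For illustration, with hypothetical blocks: if the first recorded
   use is in BB4, *TOP_BB becomes BB4; a later use in BB2, which
   dominates BB4, is also recorded and *TOP_BB moves up to BB2; a use
   in a block with no dominance relation to *TOP_BB is rejected and
   the function returns false.  */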
1128
1129 /* Look for sin, cos and cexpi calls with the same argument NAME and
1130 create a single call to cexpi CSEing the result in this case.
1131 We first walk over all immediate uses of the argument collecting
1132 statements that we can CSE in a vector and in a second pass replace
1133 the statement rhs with a REALPART or IMAGPART expression on the
1134 result of the cexpi call we insert before the use statement that
1135 dominates all other candidates. */
1136
1137 static bool
1138 execute_cse_sincos_1 (tree name)
1139 {
1140 gimple_stmt_iterator gsi;
1141 imm_use_iterator use_iter;
1142 tree fndecl, res, type;
1143 gimple *def_stmt, *use_stmt, *stmt;
1144 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
1145 auto_vec<gimple *> stmts;
1146 basic_block top_bb = NULL;
1147 int i;
1148 bool cfg_changed = false;
1149
1150 type = TREE_TYPE (name);
1151 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
1152 {
1153 if (gimple_code (use_stmt) != GIMPLE_CALL
1154 || !gimple_call_lhs (use_stmt))
1155 continue;
1156
1157 switch (gimple_call_combined_fn (use_stmt))
1158 {
1159 CASE_CFN_COS:
1160 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1161 break;
1162
1163 CASE_CFN_SIN:
1164 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1165 break;
1166
1167 CASE_CFN_CEXPI:
1168 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
1169 break;
1170
1171 default:;
1172 }
1173 }
1174
1175 if (seen_cos + seen_sin + seen_cexpi <= 1)
1176 return false;
1177
1178 /* Simply insert cexpi at the beginning of top_bb but not earlier than
1179 the name def statement. */
1180 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
1181 if (!fndecl)
1182 return false;
1183 stmt = gimple_build_call (fndecl, 1, name);
1184 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
1185 gimple_call_set_lhs (stmt, res);
1186
1187 def_stmt = SSA_NAME_DEF_STMT (name);
1188 if (!SSA_NAME_IS_DEFAULT_DEF (name)
1189 && gimple_code (def_stmt) != GIMPLE_PHI
1190 && gimple_bb (def_stmt) == top_bb)
1191 {
1192 gsi = gsi_for_stmt (def_stmt);
1193 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
1194 }
1195 else
1196 {
1197 gsi = gsi_after_labels (top_bb);
1198 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1199 }
1200 sincos_stats.inserted++;
1201
1202 /* And adjust the recorded old call sites. */
1203 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
1204 {
1205 tree rhs = NULL;
1206
1207 switch (gimple_call_combined_fn (use_stmt))
1208 {
1209 CASE_CFN_COS:
1210 rhs = fold_build1 (REALPART_EXPR, type, res);
1211 break;
1212
1213 CASE_CFN_SIN:
1214 rhs = fold_build1 (IMAGPART_EXPR, type, res);
1215 break;
1216
1217 CASE_CFN_CEXPI:
1218 rhs = res;
1219 break;
1220
1221 default:;
1222 gcc_unreachable ();
1223 }
1224
1225 /* Replace call with a copy. */
1226 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
1227
1228 gsi = gsi_for_stmt (use_stmt);
1229 gsi_replace (&gsi, stmt, true);
1230 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
1231 cfg_changed = true;
1232 }
1233
1234 return cfg_changed;
1235 }
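/* A schematic example (float case, made-up SSA names): given

	s_1 = sinf (x_2);
	c_3 = cosf (x_2);

   the pass emits a single cexpif call and rewrites both uses, roughly

	sincostmp_4 = __builtin_cexpif (x_2);
	c_3 = REALPART_EXPR <sincostmp_4>;
	s_1 = IMAGPART_EXPR <sincostmp_4>;

   since cexpi (x) == cos (x) + i * sin (x).  */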
1236
1237 /* To evaluate powi(x,n), the floating point value x raised to the
1238 constant integer exponent n, we use a hybrid algorithm that
1239 combines the "window method" with look-up tables. For an
1240 introduction to exponentiation algorithms and "addition chains",
1241 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
1242 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
1243 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
1244 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
1245
1246 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
1247 multiplications to inline before calling the system library's pow
1248 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
1249 so this default never requires calling pow, powf or powl. */
1250
1251 #ifndef POWI_MAX_MULTS
1252 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
1253 #endif
1254
1255 /* The size of the "optimal power tree" lookup table. All
1256 exponents less than this value are simply looked up in the
1257 powi_table below. This threshold is also used to size the
1258 cache of pseudo registers that hold intermediate results. */
1259 #define POWI_TABLE_SIZE 256
1260
1261 /* The size, in bits of the window, used in the "window method"
1262 exponentiation algorithm. This is equivalent to a radix of
1263 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
1264 #define POWI_WINDOW_SIZE 3
1265
1266 /* The following table is an efficient representation of an
1267 "optimal power tree". For each value, i, the corresponding
1268    value, j, in the table states that an optimal evaluation
1269 sequence for calculating pow(x,i) can be found by evaluating
1270 pow(x,j)*pow(x,i-j). An optimal power tree for the first
1271 100 integers is given in Knuth's "Seminumerical algorithms". */
1272
1273 static const unsigned char powi_table[POWI_TABLE_SIZE] =
1274 {
1275 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
1276 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
1277 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
1278 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
1279 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
1280 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
1281 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
1282 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
1283 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
1284 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
1285 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
1286 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
1287 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
1288 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
1289 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
1290 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
1291 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
1292 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
1293 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
1294 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
1295 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
1296 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
1297 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
1298 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
1299 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
1300 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
1301 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
1302 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
1303 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
1304 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
1305 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
1306 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
1307 };
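/* A worked example of the table above: powi_table[15] == 9, so x**15
   is evaluated as x**9 * x**6; recursing, x**9 == x**6 * x**3,
   x**6 == x**3 * x**3, x**3 == x**2 * x and x**2 == x * x, i.e. five
   multiplications in total instead of the fourteen of a naive
   product.  */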
1308
1309
1310 /* Return the number of multiplications required to calculate
1311 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
1312 subroutine of powi_cost. CACHE is an array indicating
1313 which exponents have already been calculated. */
1314
1315 static int
1316 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
1317 {
1318 /* If we've already calculated this exponent, then this evaluation
1319 doesn't require any additional multiplications. */
1320 if (cache[n])
1321 return 0;
1322
1323 cache[n] = true;
1324 return powi_lookup_cost (n - powi_table[n], cache)
1325 + powi_lookup_cost (powi_table[n], cache) + 1;
1326 }
1327
1328 /* Return the number of multiplications required to calculate
1329 powi(x,n) for an arbitrary x, given the exponent N. This
1330 function needs to be kept in sync with powi_as_mults below. */
1331
1332 static int
1333 powi_cost (HOST_WIDE_INT n)
1334 {
1335 bool cache[POWI_TABLE_SIZE];
1336 unsigned HOST_WIDE_INT digit;
1337 unsigned HOST_WIDE_INT val;
1338 int result;
1339
1340 if (n == 0)
1341 return 0;
1342
1343 /* Ignore the reciprocal when calculating the cost. */
1344 val = (n < 0) ? -n : n;
1345
1346 /* Initialize the exponent cache. */
1347 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
1348 cache[1] = true;
1349
1350 result = 0;
1351
1352 while (val >= POWI_TABLE_SIZE)
1353 {
1354 if (val & 1)
1355 {
1356 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
1357 result += powi_lookup_cost (digit, cache)
1358 + POWI_WINDOW_SIZE + 1;
1359 val >>= POWI_WINDOW_SIZE;
1360 }
1361 else
1362 {
1363 val >>= 1;
1364 result++;
1365 }
1366 }
1367
1368 return result + powi_lookup_cost (val, cache);
1369 }
1370
1371 /* Recursive subroutine of powi_as_mults. This function takes the
1372 array, CACHE, of already calculated exponents and an exponent N and
1373 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
1374
1375 static tree
1376 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1377 HOST_WIDE_INT n, tree *cache)
1378 {
1379 tree op0, op1, ssa_target;
1380 unsigned HOST_WIDE_INT digit;
1381 gassign *mult_stmt;
1382
1383 if (n < POWI_TABLE_SIZE && cache[n])
1384 return cache[n];
1385
1386 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1387
1388 if (n < POWI_TABLE_SIZE)
1389 {
1390 cache[n] = ssa_target;
1391 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1392 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1393 }
1394 else if (n & 1)
1395 {
1396 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1397 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1398 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1399 }
1400 else
1401 {
1402 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1403 op1 = op0;
1404 }
1405
1406 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1407 gimple_set_location (mult_stmt, loc);
1408 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1409
1410 return ssa_target;
1411 }
1412
1413 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1414 This function needs to be kept in sync with powi_cost above. */
1415
1416 static tree
1417 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1418 tree arg0, HOST_WIDE_INT n)
1419 {
1420 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1421 gassign *div_stmt;
1422 tree target;
1423
1424 if (n == 0)
1425 return build_real (type, dconst1);
1426
1427 memset (cache, 0, sizeof (cache));
1428 cache[1] = arg0;
1429
1430 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1431 if (n >= 0)
1432 return result;
1433
1434 /* If the original exponent was negative, reciprocate the result. */
1435 target = make_temp_ssa_name (type, NULL, "powmult");
1436 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1437 build_real (type, dconst1), result);
1438 gimple_set_location (div_stmt, loc);
1439 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1440
1441 return target;
1442 }
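/* A rough sketch (made-up temporaries) of the sequence built here for
   a negative exponent such as powi (x, -5):

	powmult_1 = x * x;
	powmult_2 = x * powmult_1;            (x**3)
	powmult_3 = powmult_1 * powmult_2;    (x**5)
	powmult_4 = 1.0 / powmult_3;

   that is, the positive power followed by a single reciprocal.  */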
1443
1444 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1445 location info LOC. If the arguments are appropriate, create an
1446 equivalent sequence of statements prior to GSI using an optimal
1447    number of multiplications, and return an expression holding the
1448 result. */
1449
1450 static tree
1451 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1452 tree arg0, HOST_WIDE_INT n)
1453 {
1454 /* Avoid largest negative number. */
1455 if (n != -n
1456 && ((n >= -1 && n <= 2)
1457 || (optimize_function_for_speed_p (cfun)
1458 && powi_cost (n) <= POWI_MAX_MULTS)))
1459 return powi_as_mults (gsi, loc, arg0, n);
1460
1461 return NULL_TREE;
1462 }
1463
1464 /* Build a gimple call statement that calls FN with argument ARG.
1465 Set the lhs of the call statement to a fresh SSA name. Insert the
1466 statement prior to GSI's current position, and return the fresh
1467 SSA name. */
1468
1469 static tree
1470 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1471 tree fn, tree arg)
1472 {
1473 gcall *call_stmt;
1474 tree ssa_target;
1475
1476 call_stmt = gimple_build_call (fn, 1, arg);
1477 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1478 gimple_set_lhs (call_stmt, ssa_target);
1479 gimple_set_location (call_stmt, loc);
1480 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1481
1482 return ssa_target;
1483 }
1484
1485 /* Build a gimple binary operation with the given CODE and arguments
1486 ARG0, ARG1, assigning the result to a new SSA name for variable
1487 TARGET. Insert the statement prior to GSI's current position, and
1488 return the fresh SSA name.*/
1489
1490 static tree
1491 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1492 const char *name, enum tree_code code,
1493 tree arg0, tree arg1)
1494 {
1495 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1496 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1497 gimple_set_location (stmt, loc);
1498 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1499 return result;
1500 }
1501
1502 /* Build a gimple reference operation with the given CODE and argument
1503 ARG, assigning the result to a new SSA name of TYPE with NAME.
1504 Insert the statement prior to GSI's current position, and return
1505 the fresh SSA name. */
1506
1507 static inline tree
1508 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1509 const char *name, enum tree_code code, tree arg0)
1510 {
1511 tree result = make_temp_ssa_name (type, NULL, name);
1512 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1513 gimple_set_location (stmt, loc);
1514 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1515 return result;
1516 }
1517
1518 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1519 prior to GSI's current position, and return the fresh SSA name. */
1520
1521 static tree
1522 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1523 tree type, tree val)
1524 {
1525 tree result = make_ssa_name (type);
1526 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1527 gimple_set_location (stmt, loc);
1528 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1529 return result;
1530 }
1531
1532 struct pow_synth_sqrt_info
1533 {
1534 bool *factors;
1535 unsigned int deepest;
1536 unsigned int num_mults;
1537 };
1538
1539 /* Return true iff the real value C can be represented as a
1540 sum of powers of 0.5 up to N. That is:
1541 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1542 Record in INFO the various parameters of the synthesis algorithm such
1543 as the factors a[i], the maximum 0.5 power and the number of
1544 multiplications that will be required. */
1545
1546 bool
1547 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1548 struct pow_synth_sqrt_info *info)
1549 {
1550 REAL_VALUE_TYPE factor = dconsthalf;
1551 REAL_VALUE_TYPE remainder = c;
1552
1553 info->deepest = 0;
1554 info->num_mults = 0;
1555 memset (info->factors, 0, n * sizeof (bool));
1556
1557 for (unsigned i = 0; i < n; i++)
1558 {
1559 REAL_VALUE_TYPE res;
1560
1561 /* If something inexact happened bail out now. */
1562 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1563 return false;
1564
1565 /* We have hit zero. The number is representable as a sum
1566 of powers of 0.5. */
1567 if (real_equal (&res, &dconst0))
1568 {
1569 info->factors[i] = true;
1570 info->deepest = i + 1;
1571 return true;
1572 }
1573 else if (!REAL_VALUE_NEGATIVE (res))
1574 {
1575 remainder = res;
1576 info->factors[i] = true;
1577 info->num_mults++;
1578 }
1579 else
1580 info->factors[i] = false;
1581
1582 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1583 }
1584 return false;
1585 }
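/* A worked example: for C == 0.625 and N >= 3 the loop above finds
   0.625 == 0.5 + 0.125, so factors[0] and factors[2] are set, DEEPEST
   becomes 3 and NUM_MULTS is 1; pow (x, 0.625) can then be synthesized
   as sqrt (x) * sqrt (sqrt (sqrt (x))).  */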
1586
1587 /* Return the tree corresponding to FN being applied
1588 to ARG N times at GSI and LOC.
1589 Look up previous results from CACHE if need be.
1590 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1591
1592 static tree
1593 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1594 tree fn, location_t loc, tree *cache)
1595 {
1596 tree res = cache[n];
1597 if (!res)
1598 {
1599 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1600 res = build_and_insert_call (gsi, loc, fn, prev);
1601 cache[n] = res;
1602 }
1603
1604 return res;
1605 }
1606
1607 /* Print to STREAM the repeated application of function FNAME to ARG
1608 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1609 "foo (foo (x))". */
1610
1611 static void
1612 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1613 unsigned int n)
1614 {
1615 if (n == 0)
1616 fprintf (stream, "%s", arg);
1617 else
1618 {
1619 fprintf (stream, "%s (", fname);
1620 print_nested_fn (stream, fname, arg, n - 1);
1621 fprintf (stream, ")");
1622 }
1623 }
1624
1625 /* Print to STREAM the fractional sequence of sqrt chains
1626 applied to ARG, described by INFO. Used for the dump file. */
1627
1628 static void
1629 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1630 struct pow_synth_sqrt_info *info)
1631 {
1632 for (unsigned int i = 0; i < info->deepest; i++)
1633 {
1634 bool is_set = info->factors[i];
1635 if (is_set)
1636 {
1637 print_nested_fn (stream, "sqrt", arg, i + 1);
1638 if (i != info->deepest - 1)
1639 fprintf (stream, " * ");
1640 }
1641 }
1642 }
1643
1644 /* Print to STREAM a representation of raising ARG to an integer
1645 power N. Used for the dump file. */
1646
1647 static void
1648 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1649 {
1650 if (n > 1)
1651 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1652 else if (n == 1)
1653 fprintf (stream, "%s", arg);
1654 }
1655
1656 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1657 square roots. Place at GSI and LOC. Limit the maximum depth
1658 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1659 result of the expanded sequence or NULL_TREE if the expansion failed.
1660
1661 This routine assumes that ARG1 is a real number with a fractional part
1662 (the integer exponent case will have been handled earlier in
1663 gimple_expand_builtin_pow).
1664
1665 For ARG1 > 0.0:
1666 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1667 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1668 FRAC_PART == ARG1 - WHOLE_PART:
1669 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1670 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1671 if it can be expressed as such, that is if FRAC_PART satisfies:
1672 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1673 where integer a[i] is either 0 or 1.
1674
1675 Example:
1676 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1677 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1678
1679 For ARG1 < 0.0 there are two approaches:
1680 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1681 is calculated as above.
1682
1683 Example:
1684 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1685 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1686
1687 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1688 FRAC_PART := ARG1 - WHOLE_PART
1689 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1690 Example:
1691 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1692 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1693
1694 For ARG1 < 0.0 we choose between (A) and (B) depending on
1695 how many multiplications we'd have to do.
1696 So, for the example in (B): POW (x, -5.875), if we were to
1697 follow algorithm (A) we would produce:
1698 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1699 which contains more multiplications than approach (B).
1700
1701 Hopefully, this approach will eliminate potentially expensive POW library
1702 calls when unsafe floating point math is enabled and allow the compiler to
1703 further optimise the multiplies, square roots and divides produced by this
1704 function. */
1705
1706 static tree
1707 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1708 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1709 {
1710 tree type = TREE_TYPE (arg0);
1711 machine_mode mode = TYPE_MODE (type);
1712 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1713 bool one_over = true;
1714
1715 if (!sqrtfn)
1716 return NULL_TREE;
1717
1718 if (TREE_CODE (arg1) != REAL_CST)
1719 return NULL_TREE;
1720
1721 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1722
1723 gcc_assert (max_depth > 0);
1724 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1725
1726 struct pow_synth_sqrt_info synth_info;
1727 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1728 synth_info.deepest = 0;
1729 synth_info.num_mults = 0;
1730
1731 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1732 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1733
1734 /* The whole and fractional parts of exp. */
1735 REAL_VALUE_TYPE whole_part;
1736 REAL_VALUE_TYPE frac_part;
1737
1738 real_floor (&whole_part, mode, &exp);
1739 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1740
1741
1742 REAL_VALUE_TYPE ceil_whole = dconst0;
1743 REAL_VALUE_TYPE ceil_fract = dconst0;
1744
1745 if (neg_exp)
1746 {
1747 real_ceil (&ceil_whole, mode, &exp);
1748 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1749 }
1750
1751 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1752 return NULL_TREE;
1753
1754 /* Check whether it's more profitable to not use 1.0 / ... */
1755 if (neg_exp)
1756 {
1757 struct pow_synth_sqrt_info alt_synth_info;
1758 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1759 alt_synth_info.deepest = 0;
1760 alt_synth_info.num_mults = 0;
1761
1762 if (representable_as_half_series_p (ceil_fract, max_depth,
1763 &alt_synth_info)
1764 && alt_synth_info.deepest <= synth_info.deepest
1765 && alt_synth_info.num_mults < synth_info.num_mults)
1766 {
1767 whole_part = ceil_whole;
1768 frac_part = ceil_fract;
1769 synth_info.deepest = alt_synth_info.deepest;
1770 synth_info.num_mults = alt_synth_info.num_mults;
1771 memcpy (synth_info.factors, alt_synth_info.factors,
1772 (max_depth + 1) * sizeof (bool));
1773 one_over = false;
1774 }
1775 }
1776
1777 HOST_WIDE_INT n = real_to_integer (&whole_part);
1778 REAL_VALUE_TYPE cint;
1779 real_from_integer (&cint, VOIDmode, n, SIGNED);
1780
1781 if (!real_identical (&whole_part, &cint))
1782 return NULL_TREE;
1783
1784 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1785 return NULL_TREE;
1786
1787 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1788
1789 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1790
1791 /* Calculate the integer part of the exponent. */
1792 if (n > 1)
1793 {
1794 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1795 if (!integer_res)
1796 return NULL_TREE;
1797 }
1798
1799 if (dump_file)
1800 {
1801 char string[64];
1802
1803 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1804 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1805
1806 if (neg_exp)
1807 {
1808 if (one_over)
1809 {
1810 fprintf (dump_file, "1.0 / (");
1811 dump_integer_part (dump_file, "x", n);
1812 if (n > 0)
1813 fprintf (dump_file, " * ");
1814 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1815 fprintf (dump_file, ")");
1816 }
1817 else
1818 {
1819 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1820 fprintf (dump_file, " / (");
1821 dump_integer_part (dump_file, "x", n);
1822 fprintf (dump_file, ")");
1823 }
1824 }
1825 else
1826 {
1827 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1828 if (n > 0)
1829 fprintf (dump_file, " * ");
1830 dump_integer_part (dump_file, "x", n);
1831 }
1832
1833 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1834 }
1835
1836
1837 tree fract_res = NULL_TREE;
1838 cache[0] = arg0;
1839
1840 /* Calculate the fractional part of the exponent. */
1841 for (unsigned i = 0; i < synth_info.deepest; i++)
1842 {
1843 if (synth_info.factors[i])
1844 {
1845 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1846
1847 if (!fract_res)
1848 fract_res = sqrt_chain;
1849
1850 else
1851 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1852 fract_res, sqrt_chain);
1853 }
1854 }
1855
1856 tree res = NULL_TREE;
1857
1858 if (neg_exp)
1859 {
1860 if (one_over)
1861 {
1862 if (n > 0)
1863 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1864 fract_res, integer_res);
1865 else
1866 res = fract_res;
1867
1868 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1869 build_real (type, dconst1), res);
1870 }
1871 else
1872 {
1873 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1874 fract_res, integer_res);
1875 }
1876 }
1877 else
1878 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1879 fract_res, integer_res);
1880 return res;
1881 }
1882
1883 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1884 with location info LOC. If possible, create an equivalent and
1885 less expensive sequence of statements prior to GSI, and return an
1886 expression holding the result. */
1887
1888 static tree
1889 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1890 tree arg0, tree arg1)
1891 {
1892 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1893 REAL_VALUE_TYPE c2, dconst3;
1894 HOST_WIDE_INT n;
1895 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1896 machine_mode mode;
1897 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1898 bool hw_sqrt_exists, c_is_int, c2_is_int;
1899
1900 dconst1_4 = dconst1;
1901 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1902
1903 /* If the exponent isn't a constant, there's nothing of interest
1904 to be done. */
1905 if (TREE_CODE (arg1) != REAL_CST)
1906 return NULL_TREE;
1907
1908 /* Don't perform the operation if flag_signaling_nans is on
1909 and the operand is a signaling NaN. */
1910 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1911 && ((TREE_CODE (arg0) == REAL_CST
1912 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1913 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1914 return NULL_TREE;
1915
1916 /* If the exponent is equivalent to an integer, expand to an optimal
1917 multiplication sequence when profitable. */
1918 c = TREE_REAL_CST (arg1);
1919 n = real_to_integer (&c);
1920 real_from_integer (&cint, VOIDmode, n, SIGNED);
1921 c_is_int = real_identical (&c, &cint);
1922
1923 if (c_is_int
1924 && ((n >= -1 && n <= 2)
1925 || (flag_unsafe_math_optimizations
1926 && speed_p
1927 && powi_cost (n) <= POWI_MAX_MULTS)))
1928 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1929
1930 /* Attempt various optimizations using sqrt and cbrt. */
1931 type = TREE_TYPE (arg0);
1932 mode = TYPE_MODE (type);
1933 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1934
1935 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1936 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1937 sqrt(-0) = -0. */
1938 if (sqrtfn
1939 && real_equal (&c, &dconsthalf)
1940 && !HONOR_SIGNED_ZEROS (mode))
1941 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1942
1943 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1944
1945 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1946 optimizations since 1./3. is not exactly representable. If x
1947 is negative and finite, the correct value of pow(x,1./3.) is
1948 a NaN with the "invalid" exception raised, because the value
1949 of 1./3. actually has an even denominator. The correct value
1950 of cbrt(x) is a negative real value. */
1951 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1952 dconst1_3 = real_value_truncate (mode, dconst_third ());
1953
1954 if (flag_unsafe_math_optimizations
1955 && cbrtfn
1956 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1957 && real_equal (&c, &dconst1_3))
1958 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1959
1960 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1961 if we don't have a hardware sqrt insn. */
1962 dconst1_6 = dconst1_3;
1963 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1964
1965 if (flag_unsafe_math_optimizations
1966 && sqrtfn
1967 && cbrtfn
1968 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1969 && speed_p
1970 && hw_sqrt_exists
1971 && real_equal (&c, &dconst1_6))
1972 {
1973 /* sqrt(x) */
1974 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1975
1976 /* cbrt(sqrt(x)) */
1977 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1978 }
1979
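/* To summarize the special cases above (an illustrative sketch, assuming
   the respective guards are satisfied: unsafe math for the cbrt forms,
   no signed zeros for the sqrt form):

     pow (x, 0.5)    --> sqrt (x)
     pow (x, 1./3.)  --> cbrt (x)
     pow (x, 1./6.)  --> cbrt (sqrt (x))  */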
1980
1981 /* Attempt to expand the POW as a product of square root chains.
1982 Expand the 0.25 case even when optimising for size. */
1983 if (flag_unsafe_math_optimizations
1984 && sqrtfn
1985 && hw_sqrt_exists
1986 && (speed_p || real_equal (&c, &dconst1_4))
1987 && !HONOR_SIGNED_ZEROS (mode))
1988 {
1989 unsigned int max_depth = speed_p
1990 ? param_max_pow_sqrt_depth
1991 : 2;
1992
1993 tree expand_with_sqrts
1994 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1995
1996 if (expand_with_sqrts)
1997 return expand_with_sqrts;
1998 }
1999
2000 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
2001 n = real_to_integer (&c2);
2002 real_from_integer (&cint, VOIDmode, n, SIGNED);
2003 c2_is_int = real_identical (&c2, &cint);
2004
2005 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
2006
2007 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
2008 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
2009
2010 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
2011 different from pow(x, 1./3.) due to rounding and behavior with
2012 negative x, we need to constrain this transformation to unsafe
2013 math and positive x or finite math. */
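/* An illustrative instance of this transformation (a sketch, assuming the
   guards below are satisfied):

     pow (x, 5./3.)   -->  powi (x, 1) * powi (cbrt (x), 2)
                       ==  x * cbrt (x) * cbrt (x)

     pow (x, -2./3.)  -->  1.0 / powi (cbrt (x), 2)  */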
2014 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
2015 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
2016 real_round (&c2, mode, &c2);
2017 n = real_to_integer (&c2);
2018 real_from_integer (&cint, VOIDmode, n, SIGNED);
2019 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
2020 real_convert (&c2, mode, &c2);
2021
2022 if (flag_unsafe_math_optimizations
2023 && cbrtfn
2024 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
2025 && real_identical (&c2, &c)
2026 && !c2_is_int
2027 && optimize_function_for_speed_p (cfun)
2028 && powi_cost (n / 3) <= POWI_MAX_MULTS)
2029 {
2030 tree powi_x_ndiv3 = NULL_TREE;
2031
2032 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
2033 possible or profitable, give up. Skip the degenerate case when
2034 abs(n) < 3, where the result is always 1. */
2035 if (absu_hwi (n) >= 3)
2036 {
2037 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
2038 abs_hwi (n / 3));
2039 if (!powi_x_ndiv3)
2040 return NULL_TREE;
2041 }
2042
2043 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
2044 as that creates an unnecessary variable. Instead, just produce
2045 either cbrt(x) or cbrt(x) * cbrt(x). */
2046 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
2047
2048 if (absu_hwi (n) % 3 == 1)
2049 powi_cbrt_x = cbrt_x;
2050 else
2051 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2052 cbrt_x, cbrt_x);
2053
2054 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
2055 if (absu_hwi (n) < 3)
2056 result = powi_cbrt_x;
2057 else
2058 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2059 powi_x_ndiv3, powi_cbrt_x);
2060
2061 /* If n is negative, reciprocate the result. */
2062 if (n < 0)
2063 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
2064 build_real (type, dconst1), result);
2065
2066 return result;
2067 }
2068
2069 /* No optimizations succeeded. */
2070 return NULL_TREE;
2071 }
2072
2073 /* ARG is the argument to a cabs builtin call in GSI with location info
2074 LOC. Create a sequence of statements prior to GSI that calculates
2075 sqrt(R*R + I*I), where R and I are the real and imaginary components
2076 of ARG, respectively. Return an expression holding the result. */
2077
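/* A sketch of the sequence this produces, for illustration:

     r = cabs (z)   -->   re = REALPART_EXPR <z>;
                          im = IMAGPART_EXPR <z>;
                          r  = sqrt (re*re + im*im);

   Unlike a library cabs, this does not guard against overflow in the
   intermediate squares, hence the flag_unsafe_math_optimizations test.  */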
2078 static tree
2079 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
2080 {
2081 tree real_part, imag_part, addend1, addend2, sum, result;
2082 tree type = TREE_TYPE (TREE_TYPE (arg));
2083 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
2084 machine_mode mode = TYPE_MODE (type);
2085
2086 if (!flag_unsafe_math_optimizations
2087 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
2088 || !sqrtfn
2089 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
2090 return NULL_TREE;
2091
2092 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
2093 REALPART_EXPR, arg);
2094 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2095 real_part, real_part);
2096 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
2097 IMAGPART_EXPR, arg);
2098 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2099 imag_part, imag_part);
2100 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
2101 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
2102
2103 return result;
2104 }
2105
2106 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
2107 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
2108 an optimal number of multiplies, when n is a constant. */
2109
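/* Two illustrative effects of this pass (sketches, not exact GIMPLE):

     s = sin (x);              t = cexpi (x);
     c = cos (x);     -->      s = IMAGPART_EXPR <t>;
                               c = REALPART_EXPR <t>;

     r = powi (x, 5)  -->      t1 = x * x;
                               t2 = t1 * t1;
                               r  = t2 * x;  */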
2110 namespace {
2111
2112 const pass_data pass_data_cse_sincos =
2113 {
2114 GIMPLE_PASS, /* type */
2115 "sincos", /* name */
2116 OPTGROUP_NONE, /* optinfo_flags */
2117 TV_TREE_SINCOS, /* tv_id */
2118 PROP_ssa, /* properties_required */
2119 PROP_gimple_opt_math, /* properties_provided */
2120 0, /* properties_destroyed */
2121 0, /* todo_flags_start */
2122 TODO_update_ssa, /* todo_flags_finish */
2123 };
2124
2125 class pass_cse_sincos : public gimple_opt_pass
2126 {
2127 public:
2128 pass_cse_sincos (gcc::context *ctxt)
2129 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
2130 {}
2131
2132 /* opt_pass methods: */
2133 virtual bool gate (function *)
2134 {
2135 /* We no longer require either sincos or cexp, since powi expansion
2136 piggybacks on this pass. */
2137 return optimize;
2138 }
2139
2140 virtual unsigned int execute (function *);
2141
2142 }; // class pass_cse_sincos
2143
2144 unsigned int
2145 pass_cse_sincos::execute (function *fun)
2146 {
2147 basic_block bb;
2148 bool cfg_changed = false;
2149
2150 calculate_dominance_info (CDI_DOMINATORS);
2151 memset (&sincos_stats, 0, sizeof (sincos_stats));
2152
2153 FOR_EACH_BB_FN (bb, fun)
2154 {
2155 gimple_stmt_iterator gsi;
2156 bool cleanup_eh = false;
2157
2158 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2159 {
2160 gimple *stmt = gsi_stmt (gsi);
2161
2162 /* Only the last stmt in a bb could throw, no need to call
2163 gimple_purge_dead_eh_edges if we change something in the middle
2164 of a basic block. */
2165 cleanup_eh = false;
2166
2167 if (is_gimple_call (stmt)
2168 && gimple_call_lhs (stmt))
2169 {
2170 tree arg, arg0, arg1, result;
2171 HOST_WIDE_INT n;
2172 location_t loc;
2173
2174 switch (gimple_call_combined_fn (stmt))
2175 {
2176 CASE_CFN_COS:
2177 CASE_CFN_SIN:
2178 CASE_CFN_CEXPI:
2179 /* Make sure we have either sincos or cexp. */
2180 if (!targetm.libc_has_function (function_c99_math_complex)
2181 && !targetm.libc_has_function (function_sincos))
2182 break;
2183
2184 arg = gimple_call_arg (stmt, 0);
2185 if (TREE_CODE (arg) == SSA_NAME)
2186 cfg_changed |= execute_cse_sincos_1 (arg);
2187 break;
2188
2189 CASE_CFN_POW:
2190 arg0 = gimple_call_arg (stmt, 0);
2191 arg1 = gimple_call_arg (stmt, 1);
2192
2193 loc = gimple_location (stmt);
2194 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
2195
2196 if (result)
2197 {
2198 tree lhs = gimple_get_lhs (stmt);
2199 gassign *new_stmt = gimple_build_assign (lhs, result);
2200 gimple_set_location (new_stmt, loc);
2201 unlink_stmt_vdef (stmt);
2202 gsi_replace (&gsi, new_stmt, true);
2203 cleanup_eh = true;
2204 if (gimple_vdef (stmt))
2205 release_ssa_name (gimple_vdef (stmt));
2206 }
2207 break;
2208
2209 CASE_CFN_POWI:
2210 arg0 = gimple_call_arg (stmt, 0);
2211 arg1 = gimple_call_arg (stmt, 1);
2212 loc = gimple_location (stmt);
2213
2214 if (real_minus_onep (arg0))
2215 {
2216 tree t0, t1, cond, one, minus_one;
2217 gassign *stmt;
2218
2219 t0 = TREE_TYPE (arg0);
2220 t1 = TREE_TYPE (arg1);
2221 one = build_real (t0, dconst1);
2222 minus_one = build_real (t0, dconstm1);
2223
2224 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2225 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2226 arg1, build_int_cst (t1, 1));
2227 gimple_set_location (stmt, loc);
2228 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2229
2230 result = make_temp_ssa_name (t0, NULL, "powi");
2231 stmt = gimple_build_assign (result, COND_EXPR, cond,
2232 minus_one, one);
2233 gimple_set_location (stmt, loc);
2234 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2235 }
2236 else
2237 {
2238 if (!tree_fits_shwi_p (arg1))
2239 break;
2240
2241 n = tree_to_shwi (arg1);
2242 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2243 }
2244
2245 if (result)
2246 {
2247 tree lhs = gimple_get_lhs (stmt);
2248 gassign *new_stmt = gimple_build_assign (lhs, result);
2249 gimple_set_location (new_stmt, loc);
2250 unlink_stmt_vdef (stmt);
2251 gsi_replace (&gsi, new_stmt, true);
2252 cleanup_eh = true;
2253 if (gimple_vdef (stmt))
2254 release_ssa_name (gimple_vdef (stmt));
2255 }
2256 break;
2257
2258 CASE_CFN_CABS:
2259 arg0 = gimple_call_arg (stmt, 0);
2260 loc = gimple_location (stmt);
2261 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2262
2263 if (result)
2264 {
2265 tree lhs = gimple_get_lhs (stmt);
2266 gassign *new_stmt = gimple_build_assign (lhs, result);
2267 gimple_set_location (new_stmt, loc);
2268 unlink_stmt_vdef (stmt);
2269 gsi_replace (&gsi, new_stmt, true);
2270 cleanup_eh = true;
2271 if (gimple_vdef (stmt))
2272 release_ssa_name (gimple_vdef (stmt));
2273 }
2274 break;
2275
2276 default:;
2277 }
2278 }
2279 }
2280 if (cleanup_eh)
2281 cfg_changed |= gimple_purge_dead_eh_edges (bb);
2282 }
2283
2284 statistics_counter_event (fun, "sincos statements inserted",
2285 sincos_stats.inserted);
2286
2287 return cfg_changed ? TODO_cleanup_cfg : 0;
2288 }
2289
2290 } // anon namespace
2291
2292 gimple_opt_pass *
2293 make_pass_cse_sincos (gcc::context *ctxt)
2294 {
2295 return new pass_cse_sincos (ctxt);
2296 }
2297
2298 /* Return true if stmt is a type conversion operation that can be stripped
2299 when used in a widening multiply operation. */
2300 static bool
2301 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2302 {
2303 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2304
2305 if (TREE_CODE (result_type) == INTEGER_TYPE)
2306 {
2307 tree op_type;
2308 tree inner_op_type;
2309
2310 if (!CONVERT_EXPR_CODE_P (rhs_code))
2311 return false;
2312
2313 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2314
2315 /* If the type of OP has the same precision as the result, then
2316 we can strip this conversion. The multiply operation will be
2317 selected to create the correct extension as a by-product. */
2318 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2319 return true;
2320
2321 /* We can also strip a conversion if it preserves the signed-ness of
2322 the operation and doesn't narrow the range. */
2323 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2324
2325 /* If the inner-most type is unsigned, then we can strip any
2326 intermediate widening operation. If it's signed, then the
2327 intermediate widening operation must also be signed. */
2328 if ((TYPE_UNSIGNED (inner_op_type)
2329 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2330 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2331 return true;
2332
2333 return false;
2334 }
2335
2336 return rhs_code == FIXED_CONVERT_EXPR;
2337 }
2338
2339 /* Return true if RHS is a suitable operand for a widening multiplication,
2340 assuming a target type of TYPE.
2341 There are two cases:
2342
2343 - RHS makes some value at least twice as wide. Store that value
2344 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2345
2346 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2347 but leave *TYPE_OUT untouched. */
2348
2349 static bool
2350 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2351 tree *new_rhs_out)
2352 {
2353 gimple *stmt;
2354 tree type1, rhs1;
2355
2356 if (TREE_CODE (rhs) == SSA_NAME)
2357 {
2358 stmt = SSA_NAME_DEF_STMT (rhs);
2359 if (is_gimple_assign (stmt))
2360 {
2361 if (! widening_mult_conversion_strippable_p (type, stmt))
2362 rhs1 = rhs;
2363 else
2364 {
2365 rhs1 = gimple_assign_rhs1 (stmt);
2366
2367 if (TREE_CODE (rhs1) == INTEGER_CST)
2368 {
2369 *new_rhs_out = rhs1;
2370 *type_out = NULL;
2371 return true;
2372 }
2373 }
2374 }
2375 else
2376 rhs1 = rhs;
2377
2378 type1 = TREE_TYPE (rhs1);
2379
2380 if (TREE_CODE (type1) != TREE_CODE (type)
2381 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2382 return false;
2383
2384 *new_rhs_out = rhs1;
2385 *type_out = type1;
2386 return true;
2387 }
2388
2389 if (TREE_CODE (rhs) == INTEGER_CST)
2390 {
2391 *new_rhs_out = rhs;
2392 *type_out = NULL;
2393 return true;
2394 }
2395
2396 return false;
2397 }
2398
2399 /* Return true if STMT performs a widening multiplication, assuming the
2400 output type is TYPE. If so, store the unwidened types of the operands
2401 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2402 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2403 and *TYPE2_OUT would give the operands of the multiplication. */
2404
2405 static bool
2406 is_widening_mult_p (gimple *stmt,
2407 tree *type1_out, tree *rhs1_out,
2408 tree *type2_out, tree *rhs2_out)
2409 {
2410 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2411
2412 if (TREE_CODE (type) == INTEGER_TYPE)
2413 {
2414 if (TYPE_OVERFLOW_TRAPS (type))
2415 return false;
2416 }
2417 else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2418 return false;
2419
2420 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2421 rhs1_out))
2422 return false;
2423
2424 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2425 rhs2_out))
2426 return false;
2427
2428 if (*type1_out == NULL)
2429 {
2430 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2431 return false;
2432 *type1_out = *type2_out;
2433 }
2434
2435 if (*type2_out == NULL)
2436 {
2437 if (!int_fits_type_p (*rhs2_out, *type1_out))
2438 return false;
2439 *type2_out = *type1_out;
2440 }
2441
2442 /* Ensure that the larger of the two operands comes first. */
2443 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2444 {
2445 std::swap (*type1_out, *type2_out);
2446 std::swap (*rhs1_out, *rhs2_out);
2447 }
2448
2449 return true;
2450 }
2451
2452 /* Check to see if the CALL statement is an invocation of copysign
2453 with 1. being the first argument. */
2454 static bool
2455 is_copysign_call_with_1 (gimple *call)
2456 {
2457 gcall *c = dyn_cast <gcall *> (call);
2458 if (! c)
2459 return false;
2460
2461 enum combined_fn code = gimple_call_combined_fn (c);
2462
2463 if (code == CFN_LAST)
2464 return false;
2465
2466 if (builtin_fn_p (code))
2467 {
2468 switch (as_builtin_fn (code))
2469 {
2470 CASE_FLT_FN (BUILT_IN_COPYSIGN):
2471 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2472 return real_onep (gimple_call_arg (c, 0));
2473 default:
2474 return false;
2475 }
2476 }
2477
2478 if (internal_fn_p (code))
2479 {
2480 switch (as_internal_fn (code))
2481 {
2482 case IFN_COPYSIGN:
2483 return real_onep (gimple_call_arg (c, 0));
2484 default:
2485 return false;
2486 }
2487 }
2488
2489 return false;
2490 }
2491
2492 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2493 This only happens when the xorsign optab is defined.  If the
2494 pattern is not a xorsign pattern or if expansion fails, FALSE is
2495 returned; otherwise TRUE is returned. */
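/* For example (an illustrative sketch, assuming the xorsign optab exists
   for the mode of X):

     t = copysign (1.0, y);
     r = x * t;                -->   r = .XORSIGN (x, y);

   i.e. the sign of Y is applied to X with a sign-bit XOR instead of a
   multiplication.  */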
2496 static bool
2497 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2498 {
2499 tree treeop0, treeop1, lhs, type;
2500 location_t loc = gimple_location (stmt);
2501 lhs = gimple_assign_lhs (stmt);
2502 treeop0 = gimple_assign_rhs1 (stmt);
2503 treeop1 = gimple_assign_rhs2 (stmt);
2504 type = TREE_TYPE (lhs);
2505 machine_mode mode = TYPE_MODE (type);
2506
2507 if (HONOR_SNANS (type))
2508 return false;
2509
2510 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2511 {
2512 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2513 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2514 {
2515 call0 = SSA_NAME_DEF_STMT (treeop1);
2516 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2517 return false;
2518
2519 treeop1 = treeop0;
2520 }
2521 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2522 return false;
2523
2524 gcall *c = as_a<gcall*> (call0);
2525 treeop0 = gimple_call_arg (c, 1);
2526
2527 gcall *call_stmt
2528 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2529 gimple_set_lhs (call_stmt, lhs);
2530 gimple_set_location (call_stmt, loc);
2531 gsi_replace (gsi, call_stmt, true);
2532 return true;
2533 }
2534
2535 return false;
2536 }
2537
2538 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2539 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2540 value is true iff we converted the statement. */
2541
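/* An illustrative source-level candidate (a sketch; whether it is used
   depends on the target's widening multiply support):

     int32_t a, b;
     int64_t r = (int64_t) a * (int64_t) b;

   The conversions feeding the multiply are stripped and the statement is
   rewritten as

     r = a w* b;   (WIDEN_MULT_EXPR)  */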
2542 static bool
2543 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2544 {
2545 tree lhs, rhs1, rhs2, type, type1, type2;
2546 enum insn_code handler;
2547 scalar_int_mode to_mode, from_mode, actual_mode;
2548 optab op;
2549 int actual_precision;
2550 location_t loc = gimple_location (stmt);
2551 bool from_unsigned1, from_unsigned2;
2552
2553 lhs = gimple_assign_lhs (stmt);
2554 type = TREE_TYPE (lhs);
2555 if (TREE_CODE (type) != INTEGER_TYPE)
2556 return false;
2557
2558 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2559 return false;
2560
2561 to_mode = SCALAR_INT_TYPE_MODE (type);
2562 from_mode = SCALAR_INT_TYPE_MODE (type1);
2563 if (to_mode == from_mode)
2564 return false;
2565
2566 from_unsigned1 = TYPE_UNSIGNED (type1);
2567 from_unsigned2 = TYPE_UNSIGNED (type2);
2568
2569 if (from_unsigned1 && from_unsigned2)
2570 op = umul_widen_optab;
2571 else if (!from_unsigned1 && !from_unsigned2)
2572 op = smul_widen_optab;
2573 else
2574 op = usmul_widen_optab;
2575
2576 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2577 &actual_mode);
2578
2579 if (handler == CODE_FOR_nothing)
2580 {
2581 if (op != smul_widen_optab)
2582 {
2583 /* We can use a signed multiply with unsigned types as long as
2584 there is a wider mode to use, or it is the smaller of the two
2585 types that is unsigned. Note that type1 >= type2, always. */
2586 if ((TYPE_UNSIGNED (type1)
2587 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2588 || (TYPE_UNSIGNED (type2)
2589 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2590 {
2591 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2592 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2593 return false;
2594 }
2595
2596 op = smul_widen_optab;
2597 handler = find_widening_optab_handler_and_mode (op, to_mode,
2598 from_mode,
2599 &actual_mode);
2600
2601 if (handler == CODE_FOR_nothing)
2602 return false;
2603
2604 from_unsigned1 = from_unsigned2 = false;
2605 }
2606 else
2607 return false;
2608 }
2609
2610 /* Ensure that the inputs to the handler are in the correct precision
2611 for the opcode. This will be the full mode size. */
2612 actual_precision = GET_MODE_PRECISION (actual_mode);
2613 if (2 * actual_precision > TYPE_PRECISION (type))
2614 return false;
2615 if (actual_precision != TYPE_PRECISION (type1)
2616 || from_unsigned1 != TYPE_UNSIGNED (type1))
2617 rhs1 = build_and_insert_cast (gsi, loc,
2618 build_nonstandard_integer_type
2619 (actual_precision, from_unsigned1), rhs1);
2620 if (actual_precision != TYPE_PRECISION (type2)
2621 || from_unsigned2 != TYPE_UNSIGNED (type2))
2622 rhs2 = build_and_insert_cast (gsi, loc,
2623 build_nonstandard_integer_type
2624 (actual_precision, from_unsigned2), rhs2);
2625
2626 /* Handle constants. */
2627 if (TREE_CODE (rhs1) == INTEGER_CST)
2628 rhs1 = fold_convert (type1, rhs1);
2629 if (TREE_CODE (rhs2) == INTEGER_CST)
2630 rhs2 = fold_convert (type2, rhs2);
2631
2632 gimple_assign_set_rhs1 (stmt, rhs1);
2633 gimple_assign_set_rhs2 (stmt, rhs2);
2634 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2635 update_stmt (stmt);
2636 widen_mul_stats.widen_mults_inserted++;
2637 return true;
2638 }
2639
2640 /* Process a single gimple statement STMT, which is found at the
2641 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2642 rhs (given by CODE), and try to convert it into a
2643 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2644 is true iff we converted the statement. */
2645
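/* An illustrative candidate (a sketch; it requires a widening
   multiply-and-accumulate pattern for the mode combination):

     int32_t a, b;
     int64_t acc;
     acc = (int64_t) a * (int64_t) b + acc;

   which becomes the single statement

     acc = WIDEN_MULT_PLUS_EXPR <a, b, acc>;  */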
2646 static bool
2647 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2648 enum tree_code code)
2649 {
2650 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2651 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2652 tree type, type1, type2, optype;
2653 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2654 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2655 optab this_optab;
2656 enum tree_code wmult_code;
2657 enum insn_code handler;
2658 scalar_mode to_mode, from_mode, actual_mode;
2659 location_t loc = gimple_location (stmt);
2660 int actual_precision;
2661 bool from_unsigned1, from_unsigned2;
2662
2663 lhs = gimple_assign_lhs (stmt);
2664 type = TREE_TYPE (lhs);
2665 if (TREE_CODE (type) != INTEGER_TYPE
2666 && TREE_CODE (type) != FIXED_POINT_TYPE)
2667 return false;
2668
2669 if (code == MINUS_EXPR)
2670 wmult_code = WIDEN_MULT_MINUS_EXPR;
2671 else
2672 wmult_code = WIDEN_MULT_PLUS_EXPR;
2673
2674 rhs1 = gimple_assign_rhs1 (stmt);
2675 rhs2 = gimple_assign_rhs2 (stmt);
2676
2677 if (TREE_CODE (rhs1) == SSA_NAME)
2678 {
2679 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2680 if (is_gimple_assign (rhs1_stmt))
2681 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2682 }
2683
2684 if (TREE_CODE (rhs2) == SSA_NAME)
2685 {
2686 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2687 if (is_gimple_assign (rhs2_stmt))
2688 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2689 }
2690
2691 /* Allow for one conversion statement between the multiply
2692 and addition/subtraction statement. If there are multiple
2693 conversions then we assume they would invalidate this
2694 transformation. If that's not the case then they should have
2695 been folded before now. */
2696 if (CONVERT_EXPR_CODE_P (rhs1_code))
2697 {
2698 conv1_stmt = rhs1_stmt;
2699 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2700 if (TREE_CODE (rhs1) == SSA_NAME)
2701 {
2702 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2703 if (is_gimple_assign (rhs1_stmt))
2704 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2705 }
2706 else
2707 return false;
2708 }
2709 if (CONVERT_EXPR_CODE_P (rhs2_code))
2710 {
2711 conv2_stmt = rhs2_stmt;
2712 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2713 if (TREE_CODE (rhs2) == SSA_NAME)
2714 {
2715 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2716 if (is_gimple_assign (rhs2_stmt))
2717 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2718 }
2719 else
2720 return false;
2721 }
2722
2723 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2724 is_widening_mult_p, but we still need the rhs returns.
2725
2726 It might also appear that it would be sufficient to use the existing
2727 operands of the widening multiply, but that would limit the choice of
2728 multiply-and-accumulate instructions.
2729
2730 If the widened-multiplication result has more than one use, it is
2731 probably wiser not to do the conversion. Also restrict this operation
2732 to a single basic block to avoid moving the multiply to a different block
2733 with a higher execution frequency. */
2734 if (code == PLUS_EXPR
2735 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2736 {
2737 if (!has_single_use (rhs1)
2738 || gimple_bb (rhs1_stmt) != gimple_bb (stmt)
2739 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2740 &type2, &mult_rhs2))
2741 return false;
2742 add_rhs = rhs2;
2743 conv_stmt = conv1_stmt;
2744 }
2745 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2746 {
2747 if (!has_single_use (rhs2)
2748 || gimple_bb (rhs2_stmt) != gimple_bb (stmt)
2749 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2750 &type2, &mult_rhs2))
2751 return false;
2752 add_rhs = rhs1;
2753 conv_stmt = conv2_stmt;
2754 }
2755 else
2756 return false;
2757
2758 to_mode = SCALAR_TYPE_MODE (type);
2759 from_mode = SCALAR_TYPE_MODE (type1);
2760 if (to_mode == from_mode)
2761 return false;
2762
2763 from_unsigned1 = TYPE_UNSIGNED (type1);
2764 from_unsigned2 = TYPE_UNSIGNED (type2);
2765 optype = type1;
2766
2767 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2768 if (from_unsigned1 != from_unsigned2)
2769 {
2770 if (!INTEGRAL_TYPE_P (type))
2771 return false;
2772 /* We can use a signed multiply with unsigned types as long as
2773 there is a wider mode to use, or it is the smaller of the two
2774 types that is unsigned. Note that type1 >= type2, always. */
2775 if ((from_unsigned1
2776 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2777 || (from_unsigned2
2778 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2779 {
2780 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2781 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2782 return false;
2783 }
2784
2785 from_unsigned1 = from_unsigned2 = false;
2786 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2787 false);
2788 }
2789
2790 /* If there was a conversion between the multiply and addition
2791 then we need to make sure it fits a multiply-and-accumulate.
2792 There should be a single mode change which does not change the
2793 value. */
2794 if (conv_stmt)
2795 {
2796 /* We use the original, unmodified data types for this. */
2797 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2798 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2799 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2800 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2801
2802 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2803 {
2804 /* Conversion is a truncate. */
2805 if (TYPE_PRECISION (to_type) < data_size)
2806 return false;
2807 }
2808 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2809 {
2810 /* Conversion is an extend. Check it's the right sort. */
2811 if (TYPE_UNSIGNED (from_type) != is_unsigned
2812 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2813 return false;
2814 }
2815 /* else convert is a no-op for our purposes. */
2816 }
2817
2818 /* Verify that the machine can perform a widening multiply
2819 accumulate in this mode/signedness combination, otherwise
2820 this transformation is likely to pessimize code. */
2821 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2822 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2823 from_mode, &actual_mode);
2824
2825 if (handler == CODE_FOR_nothing)
2826 return false;
2827
2828 /* Ensure that the inputs to the handler are in the correct precision
2829 for the opcode. This will be the full mode size. */
2830 actual_precision = GET_MODE_PRECISION (actual_mode);
2831 if (actual_precision != TYPE_PRECISION (type1)
2832 || from_unsigned1 != TYPE_UNSIGNED (type1))
2833 mult_rhs1 = build_and_insert_cast (gsi, loc,
2834 build_nonstandard_integer_type
2835 (actual_precision, from_unsigned1),
2836 mult_rhs1);
2837 if (actual_precision != TYPE_PRECISION (type2)
2838 || from_unsigned2 != TYPE_UNSIGNED (type2))
2839 mult_rhs2 = build_and_insert_cast (gsi, loc,
2840 build_nonstandard_integer_type
2841 (actual_precision, from_unsigned2),
2842 mult_rhs2);
2843
2844 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2845 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2846
2847 /* Handle constants. */
2848 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2849 mult_rhs1 = fold_convert (type1, mult_rhs1);
2850 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2851 mult_rhs2 = fold_convert (type2, mult_rhs2);
2852
2853 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2854 add_rhs);
2855 update_stmt (gsi_stmt (*gsi));
2856 widen_mul_stats.maccs_inserted++;
2857 return true;
2858 }
2859
2860 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2861 OP2 and which we know is used in statements that can be, together with the
2862 multiplication, converted to FMAs, perform the transformation. */
2863
2864 static void
2865 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2866 {
2867 tree type = TREE_TYPE (mul_result);
2868 gimple *use_stmt;
2869 imm_use_iterator imm_iter;
2870 gcall *fma_stmt;
2871
2872 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2873 {
2874 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2875 tree addop, mulop1 = op1, result = mul_result;
2876 bool negate_p = false;
2877 gimple_seq seq = NULL;
2878
2879 if (is_gimple_debug (use_stmt))
2880 continue;
2881
2882 if (is_gimple_assign (use_stmt)
2883 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2884 {
2885 result = gimple_assign_lhs (use_stmt);
2886 use_operand_p use_p;
2887 gimple *neguse_stmt;
2888 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2889 gsi_remove (&gsi, true);
2890 release_defs (use_stmt);
2891
2892 use_stmt = neguse_stmt;
2893 gsi = gsi_for_stmt (use_stmt);
2894 negate_p = true;
2895 }
2896
2897 tree cond, else_value, ops[3];
2898 tree_code code;
2899 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2900 ops, &else_value))
2901 gcc_unreachable ();
2902 addop = ops[0] == result ? ops[1] : ops[0];
2903
2904 if (code == MINUS_EXPR)
2905 {
2906 if (ops[0] == result)
2907 /* a * b - c -> a * b + (-c) */
2908 addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2909 else
2910 /* a - b * c -> (-b) * c + a */
2911 negate_p = !negate_p;
2912 }
2913
2914 if (negate_p)
2915 mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2916
2917 if (seq)
2918 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2919
2920 if (cond)
2921 fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2922 op2, addop, else_value);
2923 else
2924 fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2925 gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2926 gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2927 use_stmt));
2928 gsi_replace (&gsi, fma_stmt, true);
2929 /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2930 regardless of where the negation occurs. */
2931 gimple *orig_stmt = gsi_stmt (gsi);
2932 if (fold_stmt (&gsi, follow_all_ssa_edges))
2933 {
2934 if (maybe_clean_or_replace_eh_stmt (orig_stmt, gsi_stmt (gsi)))
2935 gcc_unreachable ();
2936 update_stmt (gsi_stmt (gsi));
2937 }
2938
2939 if (dump_file && (dump_flags & TDF_DETAILS))
2940 {
2941 fprintf (dump_file, "Generated FMA ");
2942 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2943 fprintf (dump_file, "\n");
2944 }
2945
2946 /* If the FMA result is negated in a single use, fold the negation
2947 too. */
2948 orig_stmt = gsi_stmt (gsi);
2949 use_operand_p use_p;
2950 gimple *neg_stmt;
2951 if (is_gimple_call (orig_stmt)
2952 && gimple_call_internal_p (orig_stmt)
2953 && gimple_call_lhs (orig_stmt)
2954 && TREE_CODE (gimple_call_lhs (orig_stmt)) == SSA_NAME
2955 && single_imm_use (gimple_call_lhs (orig_stmt), &use_p, &neg_stmt)
2956 && is_gimple_assign (neg_stmt)
2957 && gimple_assign_rhs_code (neg_stmt) == NEGATE_EXPR
2958 && !stmt_could_throw_p (cfun, neg_stmt))
2959 {
2960 gsi = gsi_for_stmt (neg_stmt);
2961 if (fold_stmt (&gsi, follow_all_ssa_edges))
2962 {
2963 if (maybe_clean_or_replace_eh_stmt (neg_stmt, gsi_stmt (gsi)))
2964 gcc_unreachable ();
2965 update_stmt (gsi_stmt (gsi));
2966 if (dump_file && (dump_flags & TDF_DETAILS))
2967 {
2968 fprintf (dump_file, "Folded FMA negation ");
2969 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2970 fprintf (dump_file, "\n");
2971 }
2972 }
2973 }
2974
2975 widen_mul_stats.fmas_inserted++;
2976 }
2977 }
2978
2979 /* Data necessary to perform the actual transformation from a multiplication
2980 and an addition to an FMA once the decision to do so has been taken, and
2981 to then delete the multiplication statement from the function IL. */
2982
2983 struct fma_transformation_info
2984 {
2985 gimple *mul_stmt;
2986 tree mul_result;
2987 tree op1;
2988 tree op2;
2989 };
2990
2991 /* Structure containing the current state of FMA deferring, i.e. whether we are
2992 deferring, whether to continue deferring, and all data necessary to come
2993 back and perform all deferred transformations. */
2994
2995 class fma_deferring_state
2996 {
2997 public:
2998 /* Class constructor. Pass true as PERFORM_DEFERRING in order to actually
2999 do any deferring. */
3000
3001 fma_deferring_state (bool perform_deferring)
3002 : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
3003 m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
3004
3005 /* List of FMA candidates for which the transformation has been determined
3006 to be possible but which, at this point in the BB analysis, we do not yet
3007 consider beneficial. */
3008 auto_vec<fma_transformation_info, 8> m_candidates;
3009
3010 /* Set of results of multiplication that are part of an already deferred FMA
3011 candidates. */
3012 hash_set<tree> m_mul_result_set;
3013
3014 /* The PHI that supposedly feeds back the result of one FMA to another across
3015 a loop boundary. */
3016 gphi *m_initial_phi;
3017
3018 /* Result of the last produced FMA candidate or NULL if there has not been
3019 one. */
3020 tree m_last_result;
3021
3022 /* If true, deferring might still be profitable. If false, transform all
3023 candidates and no longer defer. */
3024 bool m_deferring_p;
3025 };
3026
3027 /* Transform all deferred FMA candidates and mark STATE as no longer
3028 deferring. */
3029
3030 static void
3031 cancel_fma_deferring (fma_deferring_state *state)
3032 {
3033 if (!state->m_deferring_p)
3034 return;
3035
3036 for (unsigned i = 0; i < state->m_candidates.length (); i++)
3037 {
3038 if (dump_file && (dump_flags & TDF_DETAILS))
3039 fprintf (dump_file, "Generating deferred FMA\n");
3040
3041 const fma_transformation_info &fti = state->m_candidates[i];
3042 convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
3043
3044 gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
3045 gsi_remove (&gsi, true);
3046 release_defs (fti.mul_stmt);
3047 }
3048 state->m_deferring_p = false;
3049 }
3050
3051 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
3052 Otherwise return NULL. */
3053
3054 static gphi *
3055 result_of_phi (tree op)
3056 {
3057 if (TREE_CODE (op) != SSA_NAME)
3058 return NULL;
3059
3060 return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3061 }
3062
3063 /* After processing statements of a BB and recording STATE, return true if the
3064 initial phi is fed by the last FMA candidate result or one such result from
3065 previously processed BBs marked in LAST_RESULT_SET. */
3066
3067 static bool
3068 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3069 hash_set<tree> *last_result_set)
3070 {
3071 ssa_op_iter iter;
3072 use_operand_p use;
3073 FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3074 {
3075 tree t = USE_FROM_PTR (use);
3076 if (t == state->m_last_result
3077 || last_result_set->contains (t))
3078 return true;
3079 }
3080
3081 return false;
3082 }
3083
3084 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3085 with uses in additions and subtractions to form fused multiply-add
3086 operations. Returns true if successful and MUL_STMT should be removed.
3087 If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional
3088 on MUL_COND, otherwise it is unconditional.
3089
3090 If STATE indicates that we are deferring FMA transformation, that means
3091 that we do not produce FMAs for basic blocks which look like:
3092
3093 <bb 6>
3094 # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3095 _65 = _14 * _16;
3096 accumulator_66 = _65 + accumulator_111;
3097
3098 or its unrolled version, i.e. with several FMA candidates that feed the result
3099 of one into the addend of another. Instead, we add them to a list in STATE
3100 and if we later discover an FMA candidate that is not part of such a chain,
3101 we go back and perform all deferred past candidates. */
3102
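/* For reference, a sketch of the fused forms the subsequent folding can
   produce (using the internal function names):

     d = a * b + c;       -->  d = .FMA  (a, b, c);
     d = a * b - c;       -->  d = .FMS  (a, b, c);
     d = -(a * b) + c;    -->  d = .FNMA (a, b, c);
     d = -(a * b) - c;    -->  d = .FNMS (a, b, c);  */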
3103 static bool
3104 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3105 fma_deferring_state *state, tree mul_cond = NULL_TREE)
3106 {
3107 tree mul_result = gimple_get_lhs (mul_stmt);
3108 tree type = TREE_TYPE (mul_result);
3109 gimple *use_stmt, *neguse_stmt;
3110 use_operand_p use_p;
3111 imm_use_iterator imm_iter;
3112
3113 if (FLOAT_TYPE_P (type)
3114 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3115 return false;
3116
3117 /* We don't want to do bitfield reduction ops. */
3118 if (INTEGRAL_TYPE_P (type)
3119 && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3120 return false;
3121
3122 /* If the target doesn't support it, don't generate it. We assume that
3123 if fma isn't available then fms, fnma or fnms are not either. */
3124 optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3125 if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3126 return false;
3127
3128 /* If the multiplication has zero uses, it is kept around probably because
3129 of -fnon-call-exceptions. Don't optimize it away in that case,
3130 it is DCE job. */
3131 if (has_zero_uses (mul_result))
3132 return false;
3133
3134 bool check_defer
3135 = (state->m_deferring_p
3136 && (tree_to_shwi (TYPE_SIZE (type))
3137 <= param_avoid_fma_max_bits));
3138 bool defer = check_defer;
3139 bool seen_negate_p = false;
3140 /* Make sure that the multiplication statement becomes dead after
3141 the transformation, i.e. that all uses are transformed to FMAs.
3142 This means we assume that an FMA operation has the same cost
3143 as an addition. */
3144 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3145 {
3146 tree result = mul_result;
3147 bool negate_p = false;
3148
3149 use_stmt = USE_STMT (use_p);
3150
3151 if (is_gimple_debug (use_stmt))
3152 continue;
3153
3154 /* For now restrict this operation to single basic blocks. In theory
3155 we would want to support sinking the multiplication in
3156 m = a*b;
3157 if ()
3158 ma = m + c;
3159 else
3160 d = m;
3161 to form a fma in the then block and sink the multiplication to the
3162 else block. */
3163 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3164 return false;
3165
3166 /* A negate on the multiplication leads to FNMA. */
3167 if (is_gimple_assign (use_stmt)
3168 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
3169 {
3170 ssa_op_iter iter;
3171 use_operand_p usep;
3172
3173 /* If (due to earlier missed optimizations) we have two
3174 negates of the same value, treat them as equivalent
3175 to a single negate with multiple uses. */
3176 if (seen_negate_p)
3177 return false;
3178
3179 result = gimple_assign_lhs (use_stmt);
3180
3181 /* Make sure the negate statement becomes dead with this
3182 single transformation. */
3183 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3184 &use_p, &neguse_stmt))
3185 return false;
3186
3187 /* Make sure the multiplication isn't also used on that stmt. */
3188 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3189 if (USE_FROM_PTR (usep) == mul_result)
3190 return false;
3191
3192 /* Re-validate. */
3193 use_stmt = neguse_stmt;
3194 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3195 return false;
3196
3197 negate_p = seen_negate_p = true;
3198 }
3199
3200 tree cond, else_value, ops[3];
3201 tree_code code;
3202 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
3203 &else_value))
3204 return false;
3205
3206 switch (code)
3207 {
3208 case MINUS_EXPR:
3209 if (ops[1] == result)
3210 negate_p = !negate_p;
3211 break;
3212 case PLUS_EXPR:
3213 break;
3214 default:
3215 /* FMA can only be formed from PLUS and MINUS. */
3216 return false;
3217 }
3218
3219 if (mul_cond && cond != mul_cond)
3220 return false;
3221
3222 if (cond)
3223 {
3224 if (cond == result || else_value == result)
3225 return false;
3226 if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
3227 return false;
3228 }
3229
3230 /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
3231 we'll visit later, we might be able to get a more profitable
3232 match with fnma.
3233 OTOH, if we don't, a negate / fma pair likely has lower latency
3234 than a mult / subtract pair. */
3235 if (code == MINUS_EXPR
3236 && !negate_p
3237 && ops[0] == result
3238 && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
3239 && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
3240 && TREE_CODE (ops[1]) == SSA_NAME
3241 && has_single_use (ops[1]))
3242 {
3243 gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
3244 if (is_gimple_assign (stmt2)
3245 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3246 return false;
3247 }
3248
3249 /* We can't handle a * b + a * b. */
3250 if (ops[0] == ops[1])
3251 return false;
3252 /* If deferring, make sure we are not looking at an instruction that
3253 wouldn't have existed if we were not. */
3254 if (state->m_deferring_p
3255 && (state->m_mul_result_set.contains (ops[0])
3256 || state->m_mul_result_set.contains (ops[1])))
3257 return false;
3258
3259 if (check_defer)
3260 {
3261 tree use_lhs = gimple_get_lhs (use_stmt);
3262 if (state->m_last_result)
3263 {
3264 if (ops[1] == state->m_last_result
3265 || ops[0] == state->m_last_result)
3266 defer = true;
3267 else
3268 defer = false;
3269 }
3270 else
3271 {
3272 gcc_checking_assert (!state->m_initial_phi);
3273 gphi *phi;
3274 if (ops[0] == result)
3275 phi = result_of_phi (ops[1]);
3276 else
3277 {
3278 gcc_assert (ops[1] == result);
3279 phi = result_of_phi (ops[0]);
3280 }
3281
3282 if (phi)
3283 {
3284 state->m_initial_phi = phi;
3285 defer = true;
3286 }
3287 else
3288 defer = false;
3289 }
3290
3291 state->m_last_result = use_lhs;
3292 check_defer = false;
3293 }
3294 else
3295 defer = false;
3296
3297 /* While it is possible to validate whether or not the exact form that
3298 we've recognized is available in the backend, the assumption is that
3299 if the deferring logic above did not trigger, the transformation is
3300 never a loss. For instance, suppose the target only has the plain FMA
3301 pattern available. Consider a*b-c -> fma(a,b,-c): we've exchanged
3302 MUL+SUB for FMA+NEG, which is still two operations. Consider
3303 -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3304 form the two NEGs are independent and could be run in parallel. */
3305 }
3306
3307 if (defer)
3308 {
3309 fma_transformation_info fti;
3310 fti.mul_stmt = mul_stmt;
3311 fti.mul_result = mul_result;
3312 fti.op1 = op1;
3313 fti.op2 = op2;
3314 state->m_candidates.safe_push (fti);
3315 state->m_mul_result_set.add (mul_result);
3316
3317 if (dump_file && (dump_flags & TDF_DETAILS))
3318 {
3319 fprintf (dump_file, "Deferred generating FMA for multiplication ");
3320 print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
3321 fprintf (dump_file, "\n");
3322 }
3323
3324 return false;
3325 }
3326 else
3327 {
3328 if (state->m_deferring_p)
3329 cancel_fma_deferring (state);
3330 convert_mult_to_fma_1 (mul_result, op1, op2);
3331 return true;
3332 }
3333 }
3334
3335
3336 /* Helper function of match_uaddsub_overflow. Return 1
3337 if USE_STMT is an unsigned overflow check ovf != 0 for
3338 STMT, -1 if USE_STMT is an unsigned overflow check ovf == 0,
3339 and 0 otherwise. */
3340
3341 static int
3342 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3343 {
3344 enum tree_code ccode = ERROR_MARK;
3345 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3346 if (gimple_code (use_stmt) == GIMPLE_COND)
3347 {
3348 ccode = gimple_cond_code (use_stmt);
3349 crhs1 = gimple_cond_lhs (use_stmt);
3350 crhs2 = gimple_cond_rhs (use_stmt);
3351 }
3352 else if (is_gimple_assign (use_stmt))
3353 {
3354 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3355 {
3356 ccode = gimple_assign_rhs_code (use_stmt);
3357 crhs1 = gimple_assign_rhs1 (use_stmt);
3358 crhs2 = gimple_assign_rhs2 (use_stmt);
3359 }
3360 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3361 {
3362 tree cond = gimple_assign_rhs1 (use_stmt);
3363 if (COMPARISON_CLASS_P (cond))
3364 {
3365 ccode = TREE_CODE (cond);
3366 crhs1 = TREE_OPERAND (cond, 0);
3367 crhs2 = TREE_OPERAND (cond, 1);
3368 }
3369 else
3370 return 0;
3371 }
3372 else
3373 return 0;
3374 }
3375 else
3376 return 0;
3377
3378 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3379 return 0;
3380
3381 enum tree_code code = gimple_assign_rhs_code (stmt);
3382 tree lhs = gimple_assign_lhs (stmt);
3383 tree rhs1 = gimple_assign_rhs1 (stmt);
3384 tree rhs2 = gimple_assign_rhs2 (stmt);
3385
3386 switch (ccode)
3387 {
3388 case GT_EXPR:
3389 case LE_EXPR:
3390 /* r = a - b; r > a or r <= a
3391 r = a + b; a > r or a <= r or b > r or b <= r. */
3392 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3393 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3394 && crhs2 == lhs))
3395 return ccode == GT_EXPR ? 1 : -1;
3396 break;
3397 case LT_EXPR:
3398 case GE_EXPR:
3399 /* r = a - b; a < r or a >= r
3400 r = a + b; r < a or r >= a or r < b or r >= b. */
3401 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3402 || (code == PLUS_EXPR && crhs1 == lhs
3403 && (crhs2 == rhs1 || crhs2 == rhs2)))
3404 return ccode == LT_EXPR ? 1 : -1;
3405 break;
3406 default:
3407 break;
3408 }
3409 return 0;
3410 }
3411
3412 /* Recognize for unsigned x
3413 x = y - z;
3414 if (x > y)
3415 where there are other uses of x and replace it with
3416 _7 = SUB_OVERFLOW (y, z);
3417 x = REALPART_EXPR <_7>;
3418 _8 = IMAGPART_EXPR <_7>;
3419 if (_8)
3420 and similarly for addition. */
3421
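/* The analogous addition case, shown as an illustrative sketch:

     x = y + z;                 _7 = .ADD_OVERFLOW (y, z);
     if (x < y)        -->      x = REALPART_EXPR <_7>;
       ...                      _8 = IMAGPART_EXPR <_7>;
                                if (_8 != 0)
                                  ...  */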
3422 static bool
3423 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3424 enum tree_code code)
3425 {
3426 tree lhs = gimple_assign_lhs (stmt);
3427 tree type = TREE_TYPE (lhs);
3428 use_operand_p use_p;
3429 imm_use_iterator iter;
3430 bool use_seen = false;
3431 bool ovf_use_seen = false;
3432 gimple *use_stmt;
3433
3434 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3435 if (!INTEGRAL_TYPE_P (type)
3436 || !TYPE_UNSIGNED (type)
3437 || has_zero_uses (lhs)
3438 || has_single_use (lhs)
3439 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3440 TYPE_MODE (type)) == CODE_FOR_nothing)
3441 return false;
3442
3443 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3444 {
3445 use_stmt = USE_STMT (use_p);
3446 if (is_gimple_debug (use_stmt))
3447 continue;
3448
3449 if (uaddsub_overflow_check_p (stmt, use_stmt))
3450 ovf_use_seen = true;
3451 else
3452 use_seen = true;
3453 if (ovf_use_seen && use_seen)
3454 break;
3455 }
3456
3457 if (!ovf_use_seen || !use_seen)
3458 return false;
3459
3460 tree ctype = build_complex_type (type);
3461 tree rhs1 = gimple_assign_rhs1 (stmt);
3462 tree rhs2 = gimple_assign_rhs2 (stmt);
3463 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3464 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3465 2, rhs1, rhs2);
3466 tree ctmp = make_ssa_name (ctype);
3467 gimple_call_set_lhs (g, ctmp);
3468 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3469 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3470 build1 (REALPART_EXPR, type, ctmp));
3471 gsi_replace (gsi, g2, true);
3472 tree ovf = make_ssa_name (type);
3473 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3474 build1 (IMAGPART_EXPR, type, ctmp));
3475 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3476
3477 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3478 {
3479 if (is_gimple_debug (use_stmt))
3480 continue;
3481
3482 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3483 if (ovf_use == 0)
3484 continue;
3485 if (gimple_code (use_stmt) == GIMPLE_COND)
3486 {
3487 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3488 gimple_cond_set_lhs (cond_stmt, ovf);
3489 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3490 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3491 }
3492 else
3493 {
3494 gcc_checking_assert (is_gimple_assign (use_stmt));
3495 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3496 {
3497 gimple_assign_set_rhs1 (use_stmt, ovf);
3498 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3499 gimple_assign_set_rhs_code (use_stmt,
3500 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3501 }
3502 else
3503 {
3504 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3505 == COND_EXPR);
3506 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3507 boolean_type_node, ovf,
3508 build_int_cst (type, 0));
3509 gimple_assign_set_rhs1 (use_stmt, cond);
3510 }
3511 }
3512 update_stmt (use_stmt);
3513 }
3514 return true;
3515 }
3516
3517 /* Return true if target has support for divmod. */
3518
3519 static bool
3520 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3521 {
3522 /* If target supports hardware divmod insn, use it for divmod. */
3523 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3524 return true;
3525
3526 /* Check if libfunc for divmod is available. */
3527 rtx libfunc = optab_libfunc (divmod_optab, mode);
3528 if (libfunc != NULL_RTX)
3529 {
3530 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3531 we don't want to use the libfunc even if it exists for the given mode. */
3532 machine_mode div_mode;
3533 FOR_EACH_MODE_FROM (div_mode, mode)
3534 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3535 return false;
3536
3537 return targetm.expand_divmod_libfunc != NULL;
3538 }
3539
3540 return false;
3541 }
3542
3543 /* Check if stmt is candidate for divmod transform. */
3544
3545 static bool
3546 divmod_candidate_p (gassign *stmt)
3547 {
3548 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3549 machine_mode mode = TYPE_MODE (type);
3550 optab divmod_optab, div_optab;
3551
3552 if (TYPE_UNSIGNED (type))
3553 {
3554 divmod_optab = udivmod_optab;
3555 div_optab = udiv_optab;
3556 }
3557 else
3558 {
3559 divmod_optab = sdivmod_optab;
3560 div_optab = sdiv_optab;
3561 }
3562
3563 tree op1 = gimple_assign_rhs1 (stmt);
3564 tree op2 = gimple_assign_rhs2 (stmt);
3565
3566 /* Disable the transform if either is a constant, since division-by-constant
3567 may have specialized expansion. */
3568 if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3569 return false;
3570
3571 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3572 expand using the [su]divv optabs. */
3573 if (TYPE_OVERFLOW_TRAPS (type))
3574 return false;
3575
3576 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3577 return false;
3578
3579 return true;
3580 }
3581
3582 /* This function looks for:
3583 t1 = a TRUNC_DIV_EXPR b;
3584 t2 = a TRUNC_MOD_EXPR b;
3585 and transforms it to the following sequence:
3586 complex_tmp = DIVMOD (a, b);
3587 t1 = REALPART_EXPR (complex_tmp);
3588 t2 = IMAGPART_EXPR (complex_tmp);
3589 For the conditions enabling the transform see divmod_candidate_p ().
3590
3591 The transform has three parts:
3592 1) Find top_stmt, the trunc_div or trunc_mod stmt that dominates all
3593 other trunc_div_expr and trunc_mod_expr stmts with the same operands.
3594 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
3595 to the stmts vector.
3596 3) Insert the DIVMOD call just before top_stmt and update the entries in
3597 the stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
3598 IMAGPART_EXPR for mod). */
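/* As an illustrative sketch (not lifted from the GCC testsuite), for a
   source fragment such as

     void f (unsigned a, unsigned b, unsigned *q, unsigned *r)
     {
       *q = a / b;   // TRUNC_DIV_EXPR
       *r = a % b;   // TRUNC_MOD_EXPR
     }

   both statements end up reading from a single DIVMOD internal call, which
   is later expanded to one hardware divmod insn or one divmod libcall
   instead of two separate divisions.  */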
3599
3600 static bool
3601 convert_to_divmod (gassign *stmt)
3602 {
3603 if (stmt_can_throw_internal (cfun, stmt)
3604 || !divmod_candidate_p (stmt))
3605 return false;
3606
3607 tree op1 = gimple_assign_rhs1 (stmt);
3608 tree op2 = gimple_assign_rhs2 (stmt);
3609
3610 imm_use_iterator use_iter;
3611 gimple *use_stmt;
3612 auto_vec<gimple *> stmts;
3613
3614 gimple *top_stmt = stmt;
3615 basic_block top_bb = gimple_bb (stmt);
3616
3617 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
3618 at least stmt and possibly other trunc_div/trunc_mod stmts
3619 having the same operands as stmt. */
3620
3621 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
3622 {
3623 if (is_gimple_assign (use_stmt)
3624 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3625 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3626 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
3627 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
3628 {
3629 if (stmt_can_throw_internal (cfun, use_stmt))
3630 continue;
3631
3632 basic_block bb = gimple_bb (use_stmt);
3633
3634 if (bb == top_bb)
3635 {
3636 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
3637 top_stmt = use_stmt;
3638 }
3639 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
3640 {
3641 top_bb = bb;
3642 top_stmt = use_stmt;
3643 }
3644 }
3645 }
3646
3647 tree top_op1 = gimple_assign_rhs1 (top_stmt);
3648 tree top_op2 = gimple_assign_rhs2 (top_stmt);
3649
3650 stmts.safe_push (top_stmt);
3651 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
3652
3653 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
3654 to the stmts vector. The second loop will always add stmt to the stmts
3655 vector, since gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
3656 second loop ends up adding at least a single trunc_mod_expr stmt. */
3657
3658 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
3659 {
3660 if (is_gimple_assign (use_stmt)
3661 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3662 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3663 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
3664 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
3665 {
3666 if (use_stmt == top_stmt
3667 || stmt_can_throw_internal (cfun, use_stmt)
3668 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
3669 continue;
3670
3671 stmts.safe_push (use_stmt);
3672 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
3673 div_seen = true;
3674 }
3675 }
3676
3677 if (!div_seen)
3678 return false;
3679
3680 /* Part 3: Create a call to the internal fn DIVMOD:
3681 divmod_tmp = DIVMOD (op1, op2). */
3682
3683 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
3684 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
3685 call_stmt, "divmod_tmp");
3686 gimple_call_set_lhs (call_stmt, res);
3687 /* We rejected throwing statements above. */
3688 gimple_call_set_nothrow (call_stmt, true);
3689
3690 /* Insert the call before top_stmt. */
3691 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
3692 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
3693
3694 widen_mul_stats.divmod_calls_inserted++;
3695
3696 /* Update all statements in stmts vector:
3697 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
3698 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
3699
3700 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
3701 {
3702 tree new_rhs;
3703
3704 switch (gimple_assign_rhs_code (use_stmt))
3705 {
3706 case TRUNC_DIV_EXPR:
3707 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
3708 break;
3709
3710 case TRUNC_MOD_EXPR:
3711 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
3712 break;
3713
3714 default:
3715 gcc_unreachable ();
3716 }
3717
3718 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3719 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
3720 update_stmt (use_stmt);
3721 }
3722
3723 return true;
3724 }
3725
3726 /* Find integer multiplications where the operands are extended from
3727 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3728 where appropriate. */
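/* For instance (an illustrative sketch), with 16-bit shorts and 32-bit ints,

     short a, b;
     int c = (int) a * (int) b;

   multiplies the sign-extended 32-bit values; when the target provides a
   widening multiply pattern, the extensions and the MULT_EXPR are combined
   into a single WIDEN_MULT_EXPR of the two narrower operands.  */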
3729
3730 namespace {
3731
3732 const pass_data pass_data_optimize_widening_mul =
3733 {
3734 GIMPLE_PASS, /* type */
3735 "widening_mul", /* name */
3736 OPTGROUP_NONE, /* optinfo_flags */
3737 TV_TREE_WIDEN_MUL, /* tv_id */
3738 PROP_ssa, /* properties_required */
3739 0, /* properties_provided */
3740 0, /* properties_destroyed */
3741 0, /* todo_flags_start */
3742 TODO_update_ssa, /* todo_flags_finish */
3743 };
3744
3745 class pass_optimize_widening_mul : public gimple_opt_pass
3746 {
3747 public:
3748 pass_optimize_widening_mul (gcc::context *ctxt)
3749 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3750 {}
3751
3752 /* opt_pass methods: */
3753 virtual bool gate (function *)
3754 {
3755 return flag_expensive_optimizations && optimize;
3756 }
3757
3758 virtual unsigned int execute (function *);
3759
3760 }; // class pass_optimize_widening_mul
3761
3762 /* Walker class to perform the transformation in reverse dominance order. */
3763
3764 class math_opts_dom_walker : public dom_walker
3765 {
3766 public:
3767 /* Constructor.  CFG_CHANGED_P is a pointer to a boolean flag that will
3768 be set if walking modifies the CFG. */
3769
3770 math_opts_dom_walker (bool *cfg_changed_p)
3771 : dom_walker (CDI_DOMINATORS), m_last_result_set (),
3772 m_cfg_changed_p (cfg_changed_p) {}
3773
3774 /* The actual actions performed in the walk. */
3775
3776 virtual void after_dom_children (basic_block);
3777
3778 /* Set of results of multiply-and-add statement chains that were not
3779 transformed into FMAs because deferring was active. */
3780 hash_set<tree> m_last_result_set;
3781
3782 /* Pointer to a caller-provided flag that must be set if the CFG has
3783 been modified. */
3784 bool *m_cfg_changed_p;
3785 };
3786
3787 void
3788 math_opts_dom_walker::after_dom_children (basic_block bb)
3789 {
3790 gimple_stmt_iterator gsi;
3791
3792 fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);
3793
3794 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3795 {
3796 gimple *stmt = gsi_stmt (gsi);
3797 enum tree_code code;
3798
3799 if (is_gimple_assign (stmt))
3800 {
3801 code = gimple_assign_rhs_code (stmt);
3802 switch (code)
3803 {
3804 case MULT_EXPR:
3805 if (!convert_mult_to_widen (stmt, &gsi)
3806 && !convert_expand_mult_copysign (stmt, &gsi)
3807 && convert_mult_to_fma (stmt,
3808 gimple_assign_rhs1 (stmt),
3809 gimple_assign_rhs2 (stmt),
3810 &fma_state))
3811 {
3812 gsi_remove (&gsi, true);
3813 release_defs (stmt);
3814 continue;
3815 }
3816 break;
3817
3818 case PLUS_EXPR:
3819 case MINUS_EXPR:
3820 if (!convert_plusminus_to_widen (&gsi, stmt, code))
3821 match_uaddsub_overflow (&gsi, stmt, code);
3822 break;
3823
3824 case TRUNC_MOD_EXPR:
3825 convert_to_divmod (as_a<gassign *> (stmt));
3826 break;
3827
3828 default:;
3829 }
3830 }
3831 else if (is_gimple_call (stmt))
3832 {
3833 switch (gimple_call_combined_fn (stmt))
3834 {
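/* pow (x, 2.0) computes x * x, so it can feed FMA formation just like an
   ordinary MULT_EXPR.  */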
3835 CASE_CFN_POW:
3836 if (gimple_call_lhs (stmt)
3837 && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3838 && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3839 &dconst2)
3840 && convert_mult_to_fma (stmt,
3841 gimple_call_arg (stmt, 0),
3842 gimple_call_arg (stmt, 0),
3843 &fma_state))
3844 {
3845 unlink_stmt_vdef (stmt);
3846 if (gsi_remove (&gsi, true)
3847 && gimple_purge_dead_eh_edges (bb))
3848 *m_cfg_changed_p = true;
3849 release_defs (stmt);
3850 continue;
3851 }
3852 break;
3853
3854 case CFN_COND_MUL:
3855 if (convert_mult_to_fma (stmt,
3856 gimple_call_arg (stmt, 1),
3857 gimple_call_arg (stmt, 2),
3858 &fma_state,
3859 gimple_call_arg (stmt, 0)))
3860
3861 {
3862 gsi_remove (&gsi, true);
3863 release_defs (stmt);
3864 continue;
3865 }
3866 break;
3867
3868 case CFN_LAST:
3869 cancel_fma_deferring (&fma_state);
3870 break;
3871
3872 default:
3873 break;
3874 }
3875 }
3876 gsi_next (&gsi);
3877 }
3878 if (fma_state.m_deferring_p
3879 && fma_state.m_initial_phi)
3880 {
3881 gcc_checking_assert (fma_state.m_last_result);
3882 if (!last_fma_candidate_feeds_initial_phi (&fma_state,
3883 &m_last_result_set))
3884 cancel_fma_deferring (&fma_state);
3885 else
3886 m_last_result_set.add (fma_state.m_last_result);
3887 }
3888 }
3889
3890
3891 unsigned int
3892 pass_optimize_widening_mul::execute (function *fun)
3893 {
3894 bool cfg_changed = false;
3895
3896 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3897 calculate_dominance_info (CDI_DOMINATORS);
3898 renumber_gimple_stmt_uids (cfun);
3899
3900 math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3901
3902 statistics_counter_event (fun, "widening multiplications inserted",
3903 widen_mul_stats.widen_mults_inserted);
3904 statistics_counter_event (fun, "widening maccs inserted",
3905 widen_mul_stats.maccs_inserted);
3906 statistics_counter_event (fun, "fused multiply-adds inserted",
3907 widen_mul_stats.fmas_inserted);
3908 statistics_counter_event (fun, "divmod calls inserted",
3909 widen_mul_stats.divmod_calls_inserted);
3910
3911 return cfg_changed ? TODO_cleanup_cfg : 0;
3912 }
3913
3914 } // anon namespace
3915
3916 gimple_opt_pass *
3917 make_pass_optimize_widening_mul (gcc::context *ctxt)
3918 {
3919 return new pass_optimize_widening_mul (ctxt);
3920 }