gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2014 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
  45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
  63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "tree.h"
  93 #include "basic-block.h"
  94 #include "tree-ssa-alias.h"
  95 #include "internal-fn.h"
  96 #include "gimple-fold.h"
  97 #include "gimple-expr.h"
  98 #include "is-a.h"
  99 #include "gimple.h"
 100 #include "gimple-iterator.h"
 101 #include "gimplify.h"
 102 #include "gimplify-me.h"
 103 #include "stor-layout.h"
 104 #include "gimple-ssa.h"
 105 #include "tree-cfg.h"
 106 #include "tree-phinodes.h"
 107 #include "ssa-iterators.h"
 108 #include "stringpool.h"
 109 #include "tree-ssanames.h"
 110 #include "expr.h"
 111 #include "tree-dfa.h"
 112 #include "tree-ssa.h"
 113 #include "tree-pass.h"
 114 #include "alloc-pool.h"
 115 #include "target.h"
 116 #include "gimple-pretty-print.h"
 117
 118 /* FIXME: RTL headers have to be included here for optabs.  */
 119 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 120 #include "expr.h"               /* Because optabs.h wants sepops.  */
 121 #include "optabs.h"
 122
 123 /* This structure represents one basic block that either computes a
 124    division, or is a common dominator for basic blocks that compute a
 125    division.  Instances come from occ_pool (see occ_new) and are linked
        into a subset of the dominator tree through CHILDREN and NEXT.  */
 126 struct occurrence {
 127   /* The basic block represented by this structure.  occ_new also makes
          BB->aux point back at this structure.  */
 128   basic_block bb;
 129
 130   /* If non-NULL, the SSA_NAME holding a reciprocal that is usable in
 131      BB.  insert_reciprocals stores here either the reciprocal it
          inserted in BB or the one handed down from a dominating
          occurrence.  */
 132   tree recip_def;
 133
 134   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 135      was inserted in BB.  */
 136   gimple recip_def_stmt;
 137
 138   /* Pointer to a list of "struct occurrence"s for blocks dominated
 139      by BB.  */
 140   struct occurrence *children;
 141
 142   /* Pointer to the next "struct occurrence" in the list of blocks
 143      sharing a common dominator.  */
 144   struct occurrence *next;
 145
 146   /* The number of divisions that are in BB before compute_merit.  The
 147      number of divisions that are in BB or post-dominate it after
 148      compute_merit.  */
 149   int num_divisions;
 150
 151   /* True if the basic block has a division, false if it is a common
 152      dominator for basic blocks that do.  If it is false and trapping
 153      math is active, BB is not a candidate for inserting a reciprocal.  */
 154   bool bb_has_division;
 155 };
 156
 157 static struct
 158 {
       /* Counters for the reciprocal pass.  pass_cse_reciprocals::execute
          zeroes them on entry and reports them through
          statistics_counter_event before it returns.  */

 159   /* Number of 1.0/X ops inserted.  */
 160   int rdivs_inserted;
 161
 162   /* Number of 1.0/FUNC ops inserted.  */
 163   int rfuncs_inserted;
 164 } reciprocal_stats;
 165
 166 static struct
 167 {
 168   /* Number of cexpi calls inserted.  execute_cse_sincos_1 bumps this
          once for every cexpi call it emits.  */
 169   int inserted;
 170 } sincos_stats;
 171
 172 static struct
 173 {
       /* NOTE(review): the code that updates these counters is not in this
          part of the file; the two instances presumably keep separate
          tallies for "nop" patterns and for byte swaps -- confirm against
          the users of nop_stats and bswap_stats.  */

 174   /* Number of hand-written 16-bit nop / bswaps found.  */
 175   int found_16bit;
 176
 177   /* Number of hand-written 32-bit nop / bswaps found.  */
 178   int found_32bit;
 179
 180   /* Number of hand-written 64-bit nop / bswaps found.  */
 181   int found_64bit;
 182 } nop_stats, bswap_stats;
 183
 184 static struct
 185 {
       /* NOTE(review): the code that updates these counters is not in this
          part of the file -- confirm which pass owns them.  */

 186   /* Number of widening multiplication ops inserted.  */
 187   int widen_mults_inserted;
 188
 189   /* Number of integer multiply-and-accumulate ops inserted.  */
 190   int maccs_inserted;
 191
 192   /* Number of fp fused multiply-add ops inserted.  */
 193   int fmas_inserted;
 194 } widen_mul_stats;
 195
 196 /* The instance of "struct occurrence" representing the highest
 197    interesting block in the dominator tree.  It heads the list of
        top-level occurrences (chained through NEXT) that
        register_division_in builds with the entry block as common
        dominator; execute_cse_reciprocals_1 frees the whole tree and
        resets this to NULL once it is done with a DEF.  */
 198 static struct occurrence *occ_head;
 199
 200 /* Allocation pool for getting instances of "struct occurrence".  It is
        created at the start of pass_cse_reciprocals::execute and released
        at its end.  */
 201 static alloc_pool occ_pool;
 202
 203
 204
 205 /* Allocate and return a new struct occurrence for basic block BB, and
 206    whose children list is headed by CHILDREN.  */
 207 static struct occurrence *
 208 occ_new (basic_block bb, struct occurrence *children)
 209 {
 210   struct occurrence *occ;
 211
 212   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 213   memset (occ, 0, sizeof (struct occurrence));
 214
 215   occ->bb = bb;
 216   occ->children = children;
 217   return occ;
 218 }
 219
 220
 221 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 222    list of "struct occurrence"s, one per basic block, having IDOM as
 223    their common dominator.
 224
 225    We try to insert NEW_OCC as deep as possible in the tree, and we also
 226    insert any other block that is a common dominator for BB and one
 227    block already in the tree.  */
 228
 229 static void
 230 insert_bb (struct occurrence *new_occ, basic_block idom,
 231            struct occurrence **p_head)
 232 {
 233   struct occurrence *occ, **p_occ;
 234
 235   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 236     {
 237       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 238       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 239       if (dom == bb)
 240         {
 241           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 242              from its list.  */
 243           *p_occ = occ->next;
 244           occ->next = new_occ->children;
 245           new_occ->children = occ;
 246
 247           /* Try the next block (it may as well be dominated by BB).  */
 248         }
 249
 250       else if (dom == occ_bb)
 251         {
 252           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 253           insert_bb (new_occ, dom, &occ->children);
 254           return;
 255         }
 256
 257       else if (dom != idom)
 258         {
 259           gcc_assert (!dom->aux);
 260
 261           /* There is a dominator between IDOM and BB, add it and make
 262              two children out of NEW_OCC and OCC.  First, remove OCC from
 263              its list.  */
 264           *p_occ = occ->next;
 265           new_occ->next = occ;
 266           occ->next = NULL;
 267
 268           /* None of the previous blocks has DOM as a dominator: if we tail
 269              recursed, we would reexamine them uselessly. Just switch BB with
 270              DOM, and go on looking for blocks dominated by DOM.  */
 271           new_occ = occ_new (dom, new_occ);
 272         }
 273
 274       else
 275         {
 276           /* Nothing special, go on with the next element.  */
 277           p_occ = &occ->next;
 278         }
 279     }
 280
 281   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 282   new_occ->next = *p_head;
 283   *p_head = new_occ;
 284 }
 285
 286 /* Register that we found a division in BB.  */
 287
 288 static inline void
 289 register_division_in (basic_block bb)
 290 {
 291   struct occurrence *occ;
 292
 293   occ = (struct occurrence *) bb->aux;
 294   if (!occ)
 295     {
 296       occ = occ_new (bb, NULL);
 297       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 298     }
 299
 300   occ->bb_has_division = true;
 301   occ->num_divisions++;
 302 }
 303
 304
 305 /* Compute the number of divisions that postdominate each block in OCC and
 306    its children.  */
 307
 308 static void
 309 compute_merit (struct occurrence *occ)
 310 {
 311   struct occurrence *occ_child;
 312   basic_block dom = occ->bb;
 313
 314   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 315     {
 316       basic_block bb;
 317       if (occ_child->children)
 318         compute_merit (occ_child);
 319
 320       if (flag_exceptions)
 321         bb = single_noncomplex_succ (dom);
 322       else
 323         bb = dom;
 324
 325       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 326         occ->num_divisions += occ_child->num_divisions;
 327     }
 328 }
 329
 330
 331 /* Return whether USE_STMT is a floating-point division by DEF.  */
 332 static inline bool
 333 is_division_by (gimple use_stmt, tree def)
 334 {
 335   return is_gimple_assign (use_stmt)
 336          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 337          && gimple_assign_rhs2 (use_stmt) == def
 338          /* Do not recognize x / x as valid division, as we are getting
 339             confused later by replacing all immediate uses x in such
 340             a stmt.  */
 341          && gimple_assign_rhs1 (use_stmt) != def;
 342 }
 343
 344 /* Walk the subset of the dominator tree rooted at OCC, setting the
 345    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 346    the given basic block.  The field may be left NULL, of course,
 347    if it is not possible or profitable to do the optimization.
 348
 349    DEF_BSI is an iterator pointing at the statement defining DEF.
 350    If RECIP_DEF is set, a dominator already has a computation that can
 351    be used.  */
 352
 353 static void
 354 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 355                     tree def, tree recip_def, int threshold)
 356 {
 357   tree type;
 358   gimple new_stmt;
 359   gimple_stmt_iterator gsi;
 360   struct occurrence *occ_child;
 361
 362   if (!recip_def
 363       && (occ->bb_has_division || !flag_trapping_math)
 364       && occ->num_divisions >= threshold)
 365     {
 366       /* Make a variable with the replacement and substitute it.  */
 367       type = TREE_TYPE (def);
 368       recip_def = create_tmp_reg (type, "reciptmp");
 369       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 370                                                build_one_cst (type), def);
 371
 372       if (occ->bb_has_division)
 373         {
 374           /* Case 1: insert before an existing division.  */
 375           gsi = gsi_after_labels (occ->bb);
 376           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 377             gsi_next (&gsi);
 378
 379           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 380         }
 381       else if (def_gsi && occ->bb == def_gsi->bb)
 382         {
 383           /* Case 2: insert right after the definition.  Note that this will
 384              never happen if the definition statement can throw, because in
 385              that case the sole successor of the statement's basic block will
 386              dominate all the uses as well.  */
 387           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 388         }
 389       else
 390         {
 391           /* Case 3: insert in a basic block not containing defs/uses.  */
 392           gsi = gsi_after_labels (occ->bb);
 393           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 394         }
 395
 396       reciprocal_stats.rdivs_inserted++;
 397
 398       occ->recip_def_stmt = new_stmt;
 399     }
 400
 401   occ->recip_def = recip_def;
 402   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 403     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 404 }
 405
 406
 407 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 408    possible.  */
 409
 410 static inline void
 411 replace_reciprocal (use_operand_p use_p)
 412 {
 413   gimple use_stmt = USE_STMT (use_p);
 414   basic_block bb = gimple_bb (use_stmt);
 415   struct occurrence *occ = (struct occurrence *) bb->aux;
 416
 417   if (optimize_bb_for_speed_p (bb)
 418       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 419     {
 420       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 421       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 422       SET_USE (use_p, occ->recip_def);
 423       fold_stmt_inplace (&gsi);
 424       update_stmt (use_stmt);
 425     }
 426 }
 427
 428
 429 /* Free OCC and return one more "struct occurrence" to be freed.  */
 430
 431 static struct occurrence *
 432 free_bb (struct occurrence *occ)
 433 {
 434   struct occurrence *child, *next;
 435
 436   /* First get the two pointers hanging off OCC.  */
 437   next = occ->next;
 438   child = occ->children;
 439   occ->bb->aux = NULL;
 440   pool_free (occ_pool, occ);
 441
 442   /* Now ensure that we don't recurse unless it is necessary.  */
 443   if (!child)
 444     return next;
 445   else
 446     {
 447       while (next)
 448         next = free_bb (next);
 449
 450       return child;
 451     }
 452 }
 453
 454
 455 /* Look for floating-point divisions among DEF's uses, and try to
 456    replace them by multiplications with the reciprocal.  Add
 457    as many statements computing the reciprocal as needed.
 458
 459    DEF must be a GIMPLE register of a floating-point type.  */
 460
 461 static void
 462 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 463 {
 464   use_operand_p use_p;
 465   imm_use_iterator use_iter;
 466   struct occurrence *occ;
 467   int count = 0, threshold;
 468
 469   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 470
 471   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 472     {
 473       gimple use_stmt = USE_STMT (use_p);
 474       if (is_division_by (use_stmt, def))
 475         {
 476           register_division_in (gimple_bb (use_stmt));
 477           count++;
 478         }
 479     }
 480
 481   /* Do the expensive part only if we can hope to optimize something.  */
 482   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 483   if (count >= threshold)
 484     {
 485       gimple use_stmt;
 486       for (occ = occ_head; occ; occ = occ->next)
 487         {
 488           compute_merit (occ);
 489           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 490         }
 491
 492       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 493         {
 494           if (is_division_by (use_stmt, def))
 495             {
 496               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 497                 replace_reciprocal (use_p);
 498             }
 499         }
 500     }
 501
 502   for (occ = occ_head; occ; )
 503     occ = free_bb (occ);
 504
 505   occ_head = NULL;
 506 }
 507
 508 /* Go through all the floating-point SSA_NAMEs, and call
 509    execute_cse_reciprocals_1 on each of them.  */
 510 namespace {
 511
 512 const pass_data pass_data_cse_reciprocals =
 513 {
       /* Static description of the "recip" GIMPLE pass; it is handed to
          the gimple_opt_pass constructor by pass_cse_reciprocals.  */
 514   GIMPLE_PASS, /* type */
 515   "recip", /* name */
 516   OPTGROUP_NONE, /* optinfo_flags */
 517   true, /* has_execute */
 518   TV_NONE, /* tv_id */
 519   PROP_ssa, /* properties_required */
 520   0, /* properties_provided */
 521   0, /* properties_destroyed */
 522   0, /* todo_flags_start */
 523   TODO_update_ssa, /* todo_flags_finish */
 524 };
 525
 526 class pass_cse_reciprocals : public gimple_opt_pass
 527 {
 528 public:
 529   pass_cse_reciprocals (gcc::context *ctxt)
 530     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 531   {}
 532
 533   /* opt_pass methods: */
 534   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 535   virtual unsigned int execute (function *);
 536
 537 }; // class pass_cse_reciprocals
 538
 539 unsigned int
 540 pass_cse_reciprocals::execute (function *fun)
 541 {
       /* Run the reciprocal pass on FUN.  First, execute_cse_reciprocals_1
          is applied to every floating-point SSA name: the default
          definitions of parameters, the results of PHI nodes and the names
          defined by ordinary statements.  Then, in every block that is not
          optimized for size, a / func (b) is turned into a * rfunc (b)
          when the target supplies a reciprocal builtin for FUNC.
          Always returns 0.  */
 542   basic_block bb;
 543   tree arg;
 544
 545   occ_pool = create_alloc_pool ("dominators for recip",
 546                                 sizeof (struct occurrence),
 547                                 n_basic_blocks_for_fn (fun) / 3 + 1);
 548
 549   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 550   calculate_dominance_info (CDI_DOMINATORS);
 551   calculate_dominance_info (CDI_POST_DOMINATORS);
 552
       /* The aux field of a block is used to find its "struct occurrence"
          (see occ_new and free_bb), so it must be clear on entry.  */
 553 #ifdef ENABLE_CHECKING
 554   FOR_EACH_BB_FN (bb, fun)
 555     gcc_assert (!bb->aux);
 556 #endif
 557
       /* Parameters have no defining statement, hence the NULL iterator.  */
 558   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 559     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 560         && is_gimple_reg (arg))
 561       {
 562         tree name = ssa_default_def (fun, arg);
 563         if (name)
 564           execute_cse_reciprocals_1 (NULL, name);
 565       }
 566
 567   FOR_EACH_BB_FN (bb, fun)
 568     {
 569       gimple_stmt_iterator gsi;
 570       gimple phi;
 571       tree def;
 572
           /* PHI results are handled with a NULL iterator as well.  */
 573       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 574         {
 575           phi = gsi_stmt (gsi);
 576           def = PHI_RESULT (phi);
 577           if (! virtual_operand_p (def)
 578               && FLOAT_TYPE_P (TREE_TYPE (def)))
 579             execute_cse_reciprocals_1 (NULL, def);
 580         }
 581
           /* Names defined by ordinary statements: pass the iterator so
              that a reciprocal can be inserted right after the
              definition.  */
 582       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 583         {
 584           gimple stmt = gsi_stmt (gsi);
 585
 586           if (gimple_has_lhs (stmt)
 587               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 588               && FLOAT_TYPE_P (TREE_TYPE (def))
 589               && TREE_CODE (def) == SSA_NAME)
 590             execute_cse_reciprocals_1 (&gsi, def);
 591         }
 592
           /* The a / func (b) rewrite below is skipped for blocks that are
              optimized for size.  */
 593       if (optimize_bb_for_size_p (bb))
 594         continue;
 595
 596       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 597       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 598         {
 599           gimple stmt = gsi_stmt (gsi);
 600           tree fndecl;
 601
 602           if (is_gimple_assign (stmt)
 603               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 604             {
 605               tree arg1 = gimple_assign_rhs2 (stmt);
 606               gimple stmt1;
 607
 608               if (TREE_CODE (arg1) != SSA_NAME)
 609                 continue;
 610
 611               stmt1 = SSA_NAME_DEF_STMT (arg1);
 612
                   /* Only a divisor produced by a call to a normal or
                      machine-dependent builtin is of interest.  */
 613               if (is_gimple_call (stmt1)
 614                   && gimple_call_lhs (stmt1)
 615                   && (fndecl = gimple_call_fndecl (stmt1))
 616                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 617                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 618                 {
 619                   enum built_in_function code;
 620                   bool md_code, fail;
 621                   imm_use_iterator ui;
 622                   use_operand_p use_p;
 623
 624                   code = DECL_FUNCTION_CODE (fndecl);
 625                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 626
                       /* From here on FNDECL is the reciprocal builtin, if
                          the target has one.  */
 627                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 628                   if (!fndecl)
 629                     continue;
 630
 631                   /* Check that all uses of the SSA name are divisions,
 632                      otherwise replacing the defining statement will do
 633                      the wrong thing.  */
 634                   fail = false;
 635                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 636                     {
 637                       gimple stmt2 = USE_STMT (use_p);
 638                       if (is_gimple_debug (stmt2))
 639                         continue;
 640                       if (!is_gimple_assign (stmt2)
 641                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 642                           || gimple_assign_rhs1 (stmt2) == arg1
 643                           || gimple_assign_rhs2 (stmt2) != arg1)
 644                         {
 645                           fail = true;
 646                           break;
 647                         }
 648                     }
 649                   if (fail)
 650                     continue;
 651
                       /* Make the call compute the reciprocal instead ...  */
 652                   gimple_replace_ssa_lhs (stmt1, arg1);
 653                   gimple_call_set_fndecl (stmt1, fndecl);
 654                   update_stmt (stmt1);
 655                   reciprocal_stats.rfuncs_inserted++;
 656
                       /* ... and turn every use into a multiplication.  Note
                          that this loop reuses the enclosing STMT as its
                          iteration variable, and that its GSI shadows the
                          one walking the block.  */
 657                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 658                     {
 659                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 660                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 661                       fold_stmt_inplace (&gsi);
 662                       update_stmt (stmt);
 663                     }
 664                 }
 665             }
 666         }
 667     }
 668
 669   statistics_counter_event (fun, "reciprocal divs inserted",
 670                             reciprocal_stats.rdivs_inserted);
 671   statistics_counter_event (fun, "reciprocal functions inserted",
 672                             reciprocal_stats.rfuncs_inserted);
 673
 674   free_dominance_info (CDI_DOMINATORS);
 675   free_dominance_info (CDI_POST_DOMINATORS);
 676   free_alloc_pool (occ_pool);
 677   return 0;
 678 }
 679
 680 } // anon namespace
 681
 682 gimple_opt_pass *
 683 make_pass_cse_reciprocals (gcc::context *ctxt)
 684 {
 685   return new pass_cse_reciprocals (ctxt);
 686 }
 687
 688 /* Records an occurrence at statement USE_STMT in the vector of trees
 689    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 690    is not yet initialized.  Returns true if the occurrence was pushed on
 691    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 692    statements in the vector.  */
 693
 694 static bool
 695 maybe_record_sincos (vec<gimple> *stmts,
 696                      basic_block *top_bb, gimple use_stmt)
 697 {
 698   basic_block use_bb = gimple_bb (use_stmt);
 699   if (*top_bb
 700       && (*top_bb == use_bb
 701           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 702     stmts->safe_push (use_stmt);
 703   else if (!*top_bb
 704            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 705     {
 706       stmts->safe_push (use_stmt);
 707       *top_bb = use_bb;
 708     }
 709   else
 710     return false;
 711
 712   return true;
 713 }
 714
 715 /* Look for sin, cos and cexpi calls with the same argument NAME and
 716    create a single call to cexpi CSEing the result in this case.
 717    We first walk over all immediate uses of the argument collecting
 718    statements that we can CSE in a vector and in a second pass replace
 719    the statement rhs with a REALPART or IMAGPART expression on the
 720    result of the cexpi call we insert before the use statement that
 721    dominates all other candidates.  */
 722
 723 static bool
 724 execute_cse_sincos_1 (tree name)
 725 {
 726   gimple_stmt_iterator gsi;
 727   imm_use_iterator use_iter;
 728   tree fndecl, res, type;
 729   gimple def_stmt, use_stmt, stmt;
 730   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 731   vec<gimple> stmts = vNULL;
 732   basic_block top_bb = NULL;
 733   int i;
 734   bool cfg_changed = false;
 735
 736   type = TREE_TYPE (name);
 737   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 738     {
 739       if (gimple_code (use_stmt) != GIMPLE_CALL
 740           || !gimple_call_lhs (use_stmt)
 741           || !(fndecl = gimple_call_fndecl (use_stmt))
 742           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 743         continue;
 744
 745       switch (DECL_FUNCTION_CODE (fndecl))
 746         {
 747         CASE_FLT_FN (BUILT_IN_COS):
 748           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 749           break;
 750
 751         CASE_FLT_FN (BUILT_IN_SIN):
 752           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 753           break;
 754
 755         CASE_FLT_FN (BUILT_IN_CEXPI):
 756           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 757           break;
 758
 759         default:;
 760         }
 761     }
 762
 763   if (seen_cos + seen_sin + seen_cexpi <= 1)
 764     {
 765       stmts.release ();
 766       return false;
 767     }
 768
 769   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 770      the name def statement.  */
 771   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 772   if (!fndecl)
 773     return false;
 774   stmt = gimple_build_call (fndecl, 1, name);
 775   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 776   gimple_call_set_lhs (stmt, res);
 777
 778   def_stmt = SSA_NAME_DEF_STMT (name);
 779   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 780       && gimple_code (def_stmt) != GIMPLE_PHI
 781       && gimple_bb (def_stmt) == top_bb)
 782     {
 783       gsi = gsi_for_stmt (def_stmt);
 784       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 785     }
 786   else
 787     {
 788       gsi = gsi_after_labels (top_bb);
 789       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 790     }
 791   sincos_stats.inserted++;
 792
 793   /* And adjust the recorded old call sites.  */
 794   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 795     {
 796       tree rhs = NULL;
 797       fndecl = gimple_call_fndecl (use_stmt);
 798
 799       switch (DECL_FUNCTION_CODE (fndecl))
 800         {
 801         CASE_FLT_FN (BUILT_IN_COS):
 802           rhs = fold_build1 (REALPART_EXPR, type, res);
 803           break;
 804
 805         CASE_FLT_FN (BUILT_IN_SIN):
 806           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 807           break;
 808
 809         CASE_FLT_FN (BUILT_IN_CEXPI):
 810           rhs = res;
 811           break;
 812
 813         default:;
 814           gcc_unreachable ();
 815         }
 816
 817         /* Replace call with a copy.  */
 818         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 819
 820         gsi = gsi_for_stmt (use_stmt);
 821         gsi_replace (&gsi, stmt, true);
 822         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 823           cfg_changed = true;
 824     }
 825
 826   stmts.release ();
 827
 828   return cfg_changed;
 829 }
 830
 831 /* To evaluate powi(x,n), the floating point value x raised to the
 832    constant integer exponent n, we use a hybrid algorithm that
 833    combines the "window method" with look-up tables.  For an
 834    introduction to exponentiation algorithms and "addition chains",
 835    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 836    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 837    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 838    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 839
 840 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 841    multiplications to inline before calling the system library's pow
 842    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 843    so this default never requires calling pow, powf or powl.  The
        #ifndef lets a definition made before this point take precedence.  */
 844
 845 #ifndef POWI_MAX_MULTS
 846 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 847 #endif
 848
 849 /* The size of the "optimal power tree" lookup table.  All
 850    exponents less than this value are simply looked up in the
 851    powi_table below.  This threshold is also used to size the
 852    cache of pseudo registers that hold intermediate results.
        powi_table stores its entries as unsigned char, so every entry
        has to stay below 256.  */
 853 #define POWI_TABLE_SIZE 256
 854
 855 /* The size, in bits, of the window used in the "window method"
 856    exponentiation algorithm.  This is equivalent to a radix of
 857    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 858 #define POWI_WINDOW_SIZE 3
 859
 860 /* The following table is an efficient representation of an
 861    "optimal power tree".  For each value, i, the corresponding
 862    value, j, in the table states that an optimal evaluation
 863    sequence for calculating pow(x,i) can be found by evaluating
 864    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 865    100 integers is given in Knuth's "Seminumerical algorithms".
        The trailing comment on each row gives the range of indices i
        that the row covers.  */
 866
 867 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 868   {
 869       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 870       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 871       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 872      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 873      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 874      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 875      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 876      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 877      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 878      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 879      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 880      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 881      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 882      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 883      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 884      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 885      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 886      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 887      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 888      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 889      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 890      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 891      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 892      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 893      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 894     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 895     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 896     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 897     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 898     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 899     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 900     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 901   };
 902
 903
 904 /* Return the number of multiplications required to calculate
 905    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 906    subroutine of powi_cost.  CACHE is an array indicating
 907    which exponents have already been calculated.  */
 908
 909 static int
 910 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 911 {
 912   /* If we've already calculated this exponent, then this evaluation
 913      doesn't require any additional multiplications.  */
 914   if (cache[n])
 915     return 0;
 916
 917   cache[n] = true;
 918   return powi_lookup_cost (n - powi_table[n], cache)
 919          + powi_lookup_cost (powi_table[n], cache) + 1;
 920 }
 921
 922 /* Return the number of multiplications required to calculate
 923    powi(x,n) for an arbitrary x, given the exponent N.  This
 924    function needs to be kept in sync with powi_as_mults below.  */
 925
 926 static int
 927 powi_cost (HOST_WIDE_INT n)
 928 {
 929   bool cache[POWI_TABLE_SIZE];
 930   unsigned HOST_WIDE_INT digit;
 931   unsigned HOST_WIDE_INT val;
 932   int result;
 933
 934   if (n == 0)
 935     return 0;
 936
 937   /* Ignore the reciprocal when calculating the cost.  */
 938   val = (n < 0) ? -n : n;
 939
 940   /* Initialize the exponent cache.  */
 941   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 942   cache[1] = true;
 943
 944   result = 0;
 945
 946   while (val >= POWI_TABLE_SIZE)
 947     {
 948       if (val & 1)
 949         {
 950           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 951           result += powi_lookup_cost (digit, cache)
 952                     + POWI_WINDOW_SIZE + 1;
 953           val >>= POWI_WINDOW_SIZE;
 954         }
 955       else
 956         {
 957           val >>= 1;
 958           result++;
 959         }
 960     }
 961
 962   return result + powi_lookup_cost (val, cache);
 963 }
 964
 965 /* Recursive subroutine of powi_as_mults.  This function takes the
 966    array, CACHE, of already calculated exponents and an exponent N and
 967    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 968
 969 static tree
 970 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 971                  HOST_WIDE_INT n, tree *cache)
 972 {
 973   tree op0, op1, ssa_target;
 974   unsigned HOST_WIDE_INT digit;
 975   gimple mult_stmt;
 976
 977   if (n < POWI_TABLE_SIZE && cache[n])
 978     return cache[n];
 979
 980   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 981
 982   if (n < POWI_TABLE_SIZE)
 983     {
 984       cache[n] = ssa_target;
 985       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 986       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 987     }
 988   else if (n & 1)
 989     {
 990       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 991       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 992       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
 993     }
 994   else
 995     {
 996       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
 997       op1 = op0;
 998     }
 999
1000   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
1001   gimple_set_location (mult_stmt, loc);
1002   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1003
1004   return ssa_target;
1005 }
1006
1007 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1008    This function needs to be kept in sync with powi_cost above.  */
1009
1010 static tree
1011 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1012                tree arg0, HOST_WIDE_INT n)
1013 {
1014   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1015   gimple div_stmt;
1016   tree target;
1017
1018   if (n == 0)
1019     return build_real (type, dconst1);
1020
1021   memset (cache, 0,  sizeof (cache));
1022   cache[1] = arg0;
1023
1024   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1025   if (n >= 0)
1026     return result;
1027
1028   /* If the original exponent was negative, reciprocate the result.  */
1029   target = make_temp_ssa_name (type, NULL, "powmult");
1030   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1031                                            build_real (type, dconst1),
1032                                            result);
1033   gimple_set_location (div_stmt, loc);
1034   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1035
1036   return target;
1037 }
1038
1039 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1040    location info LOC.  If the arguments are appropriate, create an
1041    equivalent sequence of statements prior to GSI using an optimal
1042    number of multiplications, and return an expession holding the
1043    result.  */
1044
1045 static tree
1046 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1047                             tree arg0, HOST_WIDE_INT n)
1048 {
1049   /* Avoid largest negative number.  */
1050   if (n != -n
1051       && ((n >= -1 && n <= 2)
1052           || (optimize_function_for_speed_p (cfun)
1053               && powi_cost (n) <= POWI_MAX_MULTS)))
1054     return powi_as_mults (gsi, loc, arg0, n);
1055
1056   return NULL_TREE;
1057 }
1058
1059 /* Build a gimple call statement that calls FN with argument ARG.
1060    Set the lhs of the call statement to a fresh SSA name.  Insert the
1061    statement prior to GSI's current position, and return the fresh
1062    SSA name.  */
1063
1064 static tree
1065 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1066                        tree fn, tree arg)
1067 {
1068   gimple call_stmt;
1069   tree ssa_target;
1070
1071   call_stmt = gimple_build_call (fn, 1, arg);
1072   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1073   gimple_set_lhs (call_stmt, ssa_target);
1074   gimple_set_location (call_stmt, loc);
1075   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1076
1077   return ssa_target;
1078 }
1079
1080 /* Build a gimple binary operation with the given CODE and arguments
1081    ARG0, ARG1, assigning the result to a new SSA name for variable
1082    TARGET.  Insert the statement prior to GSI's current position, and
1083    return the fresh SSA name.*/
1084
1085 static tree
1086 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1087                         const char *name, enum tree_code code,
1088                         tree arg0, tree arg1)
1089 {
1090   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1091   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1092   gimple_set_location (stmt, loc);
1093   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1094   return result;
1095 }
1096
1097 /* Build a gimple reference operation with the given CODE and argument
1098    ARG, assigning the result to a new SSA name of TYPE with NAME.
1099    Insert the statement prior to GSI's current position, and return
1100    the fresh SSA name.  */
1101
1102 static inline tree
1103 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1104                       const char *name, enum tree_code code, tree arg0)
1105 {
1106   tree result = make_temp_ssa_name (type, NULL, name);
1107   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1108   gimple_set_location (stmt, loc);
1109   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1110   return result;
1111 }
1112
1113 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1114    prior to GSI's current position, and return the fresh SSA name.  */
1115
1116 static tree
1117 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1118                        tree type, tree val)
1119 {
1120   tree result = make_ssa_name (type, NULL);
1121   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1122   gimple_set_location (stmt, loc);
1123   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1124   return result;
1125 }
1126
1127 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1128    with location info LOC.  If possible, create an equivalent and
1129    less expensive sequence of statements prior to GSI, and return an
1130    expession holding the result.  */
1131
1132 static tree
1133 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1134                            tree arg0, tree arg1)
1135 {
1136   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1137   REAL_VALUE_TYPE c2, dconst3;
1138   HOST_WIDE_INT n;
1139   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1140   enum machine_mode mode;
1141   bool hw_sqrt_exists, c_is_int, c2_is_int;
1142
1143   /* If the exponent isn't a constant, there's nothing of interest
1144      to be done.  */
1145   if (TREE_CODE (arg1) != REAL_CST)
1146     return NULL_TREE;
1147
1148   /* If the exponent is equivalent to an integer, expand to an optimal
1149      multiplication sequence when profitable.  */
1150   c = TREE_REAL_CST (arg1);
1151   n = real_to_integer (&c);
1152   real_from_integer (&cint, VOIDmode, n, SIGNED);
1153   c_is_int = real_identical (&c, &cint);
1154
1155   if (c_is_int
1156       && ((n >= -1 && n <= 2)
1157           || (flag_unsafe_math_optimizations
1158               && optimize_bb_for_speed_p (gsi_bb (*gsi))
1159               && powi_cost (n) <= POWI_MAX_MULTS)))
1160     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1161
1162   /* Attempt various optimizations using sqrt and cbrt.  */
1163   type = TREE_TYPE (arg0);
1164   mode = TYPE_MODE (type);
1165   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1166
1167   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1168      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1169      sqrt(-0) = -0.  */
1170   if (sqrtfn
1171       && REAL_VALUES_EQUAL (c, dconsthalf)
1172       && !HONOR_SIGNED_ZEROS (mode))
1173     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1174
1175   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1176      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1177      so do this optimization even if -Os.  Don't do this optimization
1178      if we don't have a hardware sqrt insn.  */
1179   dconst1_4 = dconst1;
1180   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1181   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1182
1183   if (flag_unsafe_math_optimizations
1184       && sqrtfn
1185       && REAL_VALUES_EQUAL (c, dconst1_4)
1186       && hw_sqrt_exists)
1187     {
1188       /* sqrt(x)  */
1189       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1190
1191       /* sqrt(sqrt(x))  */
1192       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1193     }
1194
1195   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1196      optimizing for space.  Don't do this optimization if we don't have
1197      a hardware sqrt insn.  */
1198   real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1199   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1200
1201   if (flag_unsafe_math_optimizations
1202       && sqrtfn
1203       && optimize_function_for_speed_p (cfun)
1204       && REAL_VALUES_EQUAL (c, dconst3_4)
1205       && hw_sqrt_exists)
1206     {
1207       /* sqrt(x)  */
1208       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1209
1210       /* sqrt(sqrt(x))  */
1211       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1212
1213       /* sqrt(x) * sqrt(sqrt(x))  */
1214       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1215                                      sqrt_arg0, sqrt_sqrt);
1216     }
1217
1218   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1219      optimizations since 1./3. is not exactly representable.  If x
1220      is negative and finite, the correct value of pow(x,1./3.) is
1221      a NaN with the "invalid" exception raised, because the value
1222      of 1./3. actually has an even denominator.  The correct value
1223      of cbrt(x) is a negative real value.  */
1224   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1225   dconst1_3 = real_value_truncate (mode, dconst_third ());
1226
1227   if (flag_unsafe_math_optimizations
1228       && cbrtfn
1229       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1230       && REAL_VALUES_EQUAL (c, dconst1_3))
1231     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1232
1233   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1234      if we don't have a hardware sqrt insn.  */
1235   dconst1_6 = dconst1_3;
1236   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1237
1238   if (flag_unsafe_math_optimizations
1239       && sqrtfn
1240       && cbrtfn
1241       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1242       && optimize_function_for_speed_p (cfun)
1243       && hw_sqrt_exists
1244       && REAL_VALUES_EQUAL (c, dconst1_6))
1245     {
1246       /* sqrt(x)  */
1247       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1248
1249       /* cbrt(sqrt(x))  */
1250       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1251     }
1252
1253   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1254      and c not an integer, into
1255
1256        sqrt(x) * powi(x, n/2),                n > 0;
1257        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1258
1259      Do not calculate the powi factor when n/2 = 0.  */
1260   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1261   n = real_to_integer (&c2);
1262   real_from_integer (&cint, VOIDmode, n, SIGNED);
1263   c2_is_int = real_identical (&c2, &cint);
1264
1265   if (flag_unsafe_math_optimizations
1266       && sqrtfn
1267       && c2_is_int
1268       && !c_is_int
1269       && optimize_function_for_speed_p (cfun))
1270     {
1271       tree powi_x_ndiv2 = NULL_TREE;
1272
1273       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1274          possible or profitable, give up.  Skip the degenerate case when
1275          n is 1 or -1, where the result is always 1.  */
1276       if (absu_hwi (n) != 1)
1277         {
1278           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1279                                                      abs_hwi (n / 2));
1280           if (!powi_x_ndiv2)
1281             return NULL_TREE;
1282         }
1283
1284       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1285          result of the optimal multiply sequence just calculated.  */
1286       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1287
1288       if (absu_hwi (n) == 1)
1289         result = sqrt_arg0;
1290       else
1291         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1292                                          sqrt_arg0, powi_x_ndiv2);
1293
1294       /* If n is negative, reciprocate the result.  */
1295       if (n < 0)
1296         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1297                                          build_real (type, dconst1), result);
1298       return result;
1299     }
1300
1301   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1302
1303      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1304      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1305
1306      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1307      different from pow(x, 1./3.) due to rounding and behavior with
1308      negative x, we need to constrain this transformation to unsafe
1309      math and positive x or finite math.  */
1310   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1311   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1312   real_round (&c2, mode, &c2);
1313   n = real_to_integer (&c2);
1314   real_from_integer (&cint, VOIDmode, n, SIGNED);
1315   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1316   real_convert (&c2, mode, &c2);
1317
1318   if (flag_unsafe_math_optimizations
1319       && cbrtfn
1320       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1321       && real_identical (&c2, &c)
1322       && !c2_is_int
1323       && optimize_function_for_speed_p (cfun)
1324       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1325     {
1326       tree powi_x_ndiv3 = NULL_TREE;
1327
1328       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1329          possible or profitable, give up.  Skip the degenerate case when
1330          abs(n) < 3, where the result is always 1.  */
1331       if (absu_hwi (n) >= 3)
1332         {
1333           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1334                                                      abs_hwi (n / 3));
1335           if (!powi_x_ndiv3)
1336             return NULL_TREE;
1337         }
1338
1339       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1340          as that creates an unnecessary variable.  Instead, just produce
1341          either cbrt(x) or cbrt(x) * cbrt(x).  */
1342       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1343
1344       if (absu_hwi (n) % 3 == 1)
1345         powi_cbrt_x = cbrt_x;
1346       else
1347         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1348                                               cbrt_x, cbrt_x);
1349
1350       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1351       if (absu_hwi (n) < 3)
1352         result = powi_cbrt_x;
1353       else
1354         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1355                                          powi_x_ndiv3, powi_cbrt_x);
1356
1357       /* If n is negative, reciprocate the result.  */
1358       if (n < 0)
1359         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1360                                          build_real (type, dconst1), result);
1361
1362       return result;
1363     }
1364
1365   /* No optimizations succeeded.  */
1366   return NULL_TREE;
1367 }
1368
1369 /* ARG is the argument to a cabs builtin call in GSI with location info
1370    LOC.  Create a sequence of statements prior to GSI that calculates
1371    sqrt(R*R + I*I), where R and I are the real and imaginary components
1372    of ARG, respectively.  Return an expression holding the result.  */
1373
1374 static tree
1375 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1376 {
1377   tree real_part, imag_part, addend1, addend2, sum, result;
1378   tree type = TREE_TYPE (TREE_TYPE (arg));
1379   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1380   enum machine_mode mode = TYPE_MODE (type);
1381
1382   if (!flag_unsafe_math_optimizations
1383       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1384       || !sqrtfn
1385       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1386     return NULL_TREE;
1387
1388   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1389                                     REALPART_EXPR, arg);
1390   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1391                                     real_part, real_part);
1392   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1393                                     IMAGPART_EXPR, arg);
1394   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1395                                     imag_part, imag_part);
1396   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1397   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1398
1399   return result;
1400 }
1401
/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
   an optimal number of multiplies when n is a constant, and try to
   expand calls to pow and cabs into cheaper statement sequences.  */
1405
1406 namespace {
1407
1408 const pass_data pass_data_cse_sincos =
1409 {
1410   GIMPLE_PASS, /* type */
1411   "sincos", /* name */
1412   OPTGROUP_NONE, /* optinfo_flags */
1413   true, /* has_execute */
1414   TV_NONE, /* tv_id */
1415   PROP_ssa, /* properties_required */
1416   0, /* properties_provided */
1417   0, /* properties_destroyed */
1418   0, /* todo_flags_start */
1419   TODO_update_ssa, /* todo_flags_finish */
1420 };
1421
1422 class pass_cse_sincos : public gimple_opt_pass
1423 {
1424 public:
1425   pass_cse_sincos (gcc::context *ctxt)
1426     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1427   {}
1428
1429   /* opt_pass methods: */
1430   virtual bool gate (function *)
1431     {
1432       /* We no longer require either sincos or cexp, since powi expansion
1433          piggybacks on this pass.  */
1434       return optimize;
1435     }
1436
1437   virtual unsigned int execute (function *);
1438
1439 }; // class pass_cse_sincos
1440
1441 unsigned int
1442 pass_cse_sincos::execute (function *fun)
1443 {
1444   basic_block bb;
1445   bool cfg_changed = false;
1446
1447   calculate_dominance_info (CDI_DOMINATORS);
1448   memset (&sincos_stats, 0, sizeof (sincos_stats));
1449
1450   FOR_EACH_BB_FN (bb, fun)
1451     {
1452       gimple_stmt_iterator gsi;
1453       bool cleanup_eh = false;
1454
1455       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1456         {
1457           gimple stmt = gsi_stmt (gsi);
1458           tree fndecl;
1459
1460           /* Only the last stmt in a bb could throw, no need to call
1461              gimple_purge_dead_eh_edges if we change something in the middle
1462              of a basic block.  */
1463           cleanup_eh = false;
1464
1465           if (is_gimple_call (stmt)
1466               && gimple_call_lhs (stmt)
1467               && (fndecl = gimple_call_fndecl (stmt))
1468               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1469             {
1470               tree arg, arg0, arg1, result;
1471               HOST_WIDE_INT n;
1472               location_t loc;
1473
1474               switch (DECL_FUNCTION_CODE (fndecl))
1475                 {
1476                 CASE_FLT_FN (BUILT_IN_COS):
1477                 CASE_FLT_FN (BUILT_IN_SIN):
1478                 CASE_FLT_FN (BUILT_IN_CEXPI):
1479                   /* Make sure we have either sincos or cexp.  */
1480                   if (!targetm.libc_has_function (function_c99_math_complex)
1481                       && !targetm.libc_has_function (function_sincos))
1482                     break;
1483
1484                   arg = gimple_call_arg (stmt, 0);
1485                   if (TREE_CODE (arg) == SSA_NAME)
1486                     cfg_changed |= execute_cse_sincos_1 (arg);
1487                   break;
1488
1489                 CASE_FLT_FN (BUILT_IN_POW):
1490                   arg0 = gimple_call_arg (stmt, 0);
1491                   arg1 = gimple_call_arg (stmt, 1);
1492
1493                   loc = gimple_location (stmt);
1494                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1495
1496                   if (result)
1497                     {
1498                       tree lhs = gimple_get_lhs (stmt);
1499                       gimple new_stmt = gimple_build_assign (lhs, result);
1500                       gimple_set_location (new_stmt, loc);
1501                       unlink_stmt_vdef (stmt);
1502                       gsi_replace (&gsi, new_stmt, true);
1503                       cleanup_eh = true;
1504                       if (gimple_vdef (stmt))
1505                         release_ssa_name (gimple_vdef (stmt));
1506                     }
1507                   break;
1508
1509                 CASE_FLT_FN (BUILT_IN_POWI):
1510                   arg0 = gimple_call_arg (stmt, 0);
1511                   arg1 = gimple_call_arg (stmt, 1);
1512                   loc = gimple_location (stmt);
1513
1514                   if (real_minus_onep (arg0))
1515                     {
1516                       tree t0, t1, cond, one, minus_one;
1517                       gimple stmt;
1518
1519                       t0 = TREE_TYPE (arg0);
1520                       t1 = TREE_TYPE (arg1);
1521                       one = build_real (t0, dconst1);
1522                       minus_one = build_real (t0, dconstm1);
1523
1524                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1525                       stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
1526                                                            arg1,
1527                                                            build_int_cst (t1,
1528                                                                           1));
1529                       gimple_set_location (stmt, loc);
1530                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1531
1532                       result = make_temp_ssa_name (t0, NULL, "powi");
1533                       stmt = gimple_build_assign_with_ops (COND_EXPR, result,
1534                                                            cond,
1535                                                            minus_one, one);
1536                       gimple_set_location (stmt, loc);
1537                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1538                     }
1539                   else
1540                     {
1541                       if (!tree_fits_shwi_p (arg1))
1542                         break;
1543
1544                       n = tree_to_shwi (arg1);
1545                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1546                     }
1547
1548                   if (result)
1549                     {
1550                       tree lhs = gimple_get_lhs (stmt);
1551                       gimple new_stmt = gimple_build_assign (lhs, result);
1552                       gimple_set_location (new_stmt, loc);
1553                       unlink_stmt_vdef (stmt);
1554                       gsi_replace (&gsi, new_stmt, true);
1555                       cleanup_eh = true;
1556                       if (gimple_vdef (stmt))
1557                         release_ssa_name (gimple_vdef (stmt));
1558                     }
1559                   break;
1560
1561                 CASE_FLT_FN (BUILT_IN_CABS):
1562                   arg0 = gimple_call_arg (stmt, 0);
1563                   loc = gimple_location (stmt);
1564                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1565
1566                   if (result)
1567                     {
1568                       tree lhs = gimple_get_lhs (stmt);
1569                       gimple new_stmt = gimple_build_assign (lhs, result);
1570                       gimple_set_location (new_stmt, loc);
1571                       unlink_stmt_vdef (stmt);
1572                       gsi_replace (&gsi, new_stmt, true);
1573                       cleanup_eh = true;
1574                       if (gimple_vdef (stmt))
1575                         release_ssa_name (gimple_vdef (stmt));
1576                     }
1577                   break;
1578
1579                 default:;
1580                 }
1581             }
1582         }
1583       if (cleanup_eh)
1584         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1585     }
1586
1587   statistics_counter_event (fun, "sincos statements inserted",
1588                             sincos_stats.inserted);
1589
1590   free_dominance_info (CDI_DOMINATORS);
1591   return cfg_changed ? TODO_cleanup_cfg : 0;
1592 }
1593
1594 } // anon namespace
1595
1596 gimple_opt_pass *
1597 make_pass_cse_sincos (gcc::context *ctxt)
1598 {
1599   return new pass_cse_sincos (ctxt);
1600 }
1601
1602 /* A symbolic number is used to detect byte permutation and selection
1603    patterns.  Therefore the field N contains an artificial number
1604    consisting of byte size markers:
1605
1606    0    - byte has the value 0
1607    1..size - byte contains the content of the byte
1608    number indexed with that value minus one.
1609
1610    To detect permutations on memory sources (arrays and structures), a symbolic
1611    number is also associated a base address (the array or structure the load is
1612    made from), an offset from the base address and a range which gives the
1613    difference between the highest and lowest accessed memory location to make
1614    such a symbolic number. The range is thus different from size which reflects
1615    the size of the type of current expression. Note that for non memory source,
1616    range holds the same value as size.
1617
1618    For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1619    a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1620    still have a size of 2 but this time a range of 1.  */
1621
1622 struct symbolic_number {
1623   unsigned HOST_WIDEST_INT n;
1624   int size;
1625   tree base_addr;
1626   tree offset;
1627   HOST_WIDE_INT bytepos;
1628   tree alias_set;
1629   tree vuse;
1630   unsigned HOST_WIDE_INT range;
1631 };
1632
/* Value the find_bswap_or_nop_1 result has to match for the
   expression to be a nop: the byte markers 1 to 8 in their original
   order.  It is masked according to the size of the symbolic number
   before being used, and is 0 when HOST_WIDEST_INT has fewer than 8
   bytes.  */
#define CMPNOP \
  (sizeof (HOST_WIDEST_INT) < 8 \
   ? 0 \
   : (((unsigned HOST_WIDEST_INT) 0x08070605 << 32) | 0x04030201))

/* Value the find_bswap_or_nop_1 result has to match for the
   expression to be a byte swap: the byte markers 1 to 8 in reversed
   order.  It is masked according to the size of the symbolic number
   before being used, and is 0 when HOST_WIDEST_INT has fewer than 8
   bytes.  */
#define CMPXCHG \
  (sizeof (HOST_WIDEST_INT) < 8 \
   ? 0 \
   : (((unsigned HOST_WIDEST_INT) 0x01020304 << 32) | 0x05060708))
1644
1645 /* Perform the SHIFT or ROTATE operation CODE by COUNT bits on symbolic
1646    number N.  Return false if CODE is not a shift or rotate, or if COUNT is
1647    negative, not a multiple of 8, or not smaller than the width in bits of N.  */
1648
1649 static inline bool
1650 do_shift_rotate (enum tree_code code,
1651                  struct symbolic_number *n,
1652                  int count)
1653 {
1654   /* Only whole-byte amounts inside [0, width of N) can be modelled; a
1655      host shift by any other amount would also be undefined behavior.  */
1656   if (count < 0 || count % 8 != 0 || count >= n->size * BITS_PER_UNIT)
1657     return false;
1658   /* Zero the extra bits of N so none is shifted into the significant bits.  */
1659   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1660     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1661
1662   switch (code)
1663     {
1664     case LSHIFT_EXPR:
1665       n->n <<= count;
1666       break;
1667     case RSHIFT_EXPR:
1668       n->n >>= count;
1669       break;
1670     case LROTATE_EXPR:	/* A rotate by 0 must not shift by the full width.  */
1671       n->n = (n->n << count) | (count ? n->n >> ((n->size * BITS_PER_UNIT) - count) : 0);
1672       break;
1673     case RROTATE_EXPR:	/* Likewise.  */
1674       n->n = (n->n >> count) | (count ? n->n << ((n->size * BITS_PER_UNIT) - count) : 0);
1675       break;
1676     default:
1677       return false;
1678     }
1679   /* Zero unused bits for size.  */
1680   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1681     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1682   return true;
1683 }
1684
1685 /* Return true if STMT produces an integer whose precision is exactly the
1686    N->size bytes described by symbolic number N, false otherwise.  */
1687
1688 static inline bool
1689 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1690 {
1691   tree result_type = gimple_expr_type (stmt);
1692   bool full_width;
1693
1694   /* A symbolic number only describes plain integer values.  */
1695   if (TREE_CODE (result_type) != INTEGER_TYPE)
1696     return false;
1697
1698   /* The result must have one byte for each byte tracked by N, no
1699      more and no less.  */
1700   full_width = TYPE_PRECISION (result_type) == n->size * BITS_PER_UNIT;
1701   return full_width;
1702 }
1703
1704 /* Check if STMT is a non volatile load whose memory source REF starts and
1705    ends on a byte boundary and return the answer.  If so, the base of the memory
1706    area accessed, the offset and byte position of the access from that base, its alias pointer type and its vuse are recorded in N.  */
1707
1708 bool
1709 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1710 {
1711   /* Leaf node is an array or component ref. Memorize its base and
1712      offset from base to compare to other such leaf node.  */
1713   HOST_WIDE_INT bitsize, bitpos;
1714   enum machine_mode mode;
1715   int unsignedp, volatilep;
1716
1717   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1718     return false;
1719
1720   n->base_addr = get_inner_reference (ref, &bitsize, &bitpos, &n->offset,
1721                                       &mode, &unsignedp, &volatilep, false);
1722   /* Fold the constant offset of a MEM_REF base into BITPOS and use its pointer operand as base.  */
1723   if (TREE_CODE (n->base_addr) == MEM_REF)
1724     {
1725       offset_int bit_offset = 0;
1726       tree off = TREE_OPERAND (n->base_addr, 1);
1727
1728       if (!integer_zerop (off))
1729         {
1730           offset_int boff, coff = mem_ref_offset (n->base_addr);
1731           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1732           bit_offset += boff;
1733         }
1734
1735       n->base_addr = TREE_OPERAND (n->base_addr, 0);
1736
1737       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1738       if (wi::neg_p (bit_offset))
1739         {
1740           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1741           offset_int tem = bit_offset.and_not (mask);
1742           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1743              Subtract it from BIT_OFFSET and add it (scaled) to OFFSET.  */
1744           bit_offset -= tem;
1745           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1746           if (n->offset)
1747             n->offset = size_binop (PLUS_EXPR, n->offset,
1748                                     wide_int_to_tree (sizetype, tem));
1749           else
1750             n->offset = wide_int_to_tree (sizetype, tem);
1751         }
1752
1753       bitpos += bit_offset.to_shwi ();
1754     }
1755   /* Only accesses that start and end on a byte boundary are accepted.  */
1756   if (bitpos % BITS_PER_UNIT)
1757     return false;
1758   if (bitsize % BITS_PER_UNIT)
1759     return false;
1760
1761   n->bytepos = bitpos / BITS_PER_UNIT;
1762   n->alias_set = reference_alias_ptr_type (ref);
1763   n->vuse = gimple_vuse (stmt);
1764   return true;
1765 }
1766
1767 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1768    the operation given by the rhs of STMT on the result.  If the operation
1769    could successfully be executed the function returns the tree expression of
1770    the source operand and NULL otherwise.  LIMIT bounds the recursion depth.  */
1771
1772 static tree
1773 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1774 {
1775   enum tree_code code;
1776   tree rhs1, rhs2 = NULL;
1777   gimple rhs1_stmt, rhs2_stmt;
1778   tree source_expr1;
1779   enum gimple_rhs_class rhs_class;
1780
1781   if (!limit || !is_gimple_assign (stmt))
1782     return NULL_TREE;
1783
1784   rhs1 = gimple_assign_rhs1 (stmt);
1785   /* A load is a leaf: its memory description goes in N, the caller sets up the byte markers.  */
1786   if (find_bswap_or_nop_load (stmt, rhs1, n))
1787     return rhs1;
1788
1789   if (TREE_CODE (rhs1) != SSA_NAME)
1790     return NULL_TREE;
1791
1792   code = gimple_assign_rhs_code (stmt);
1793   rhs_class = gimple_assign_rhs_class (stmt);
1794   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1795
1796   if (rhs_class == GIMPLE_BINARY_RHS)
1797     rhs2 = gimple_assign_rhs2 (stmt);
1798
1799   /* Handle unary rhs and binary rhs with integer constants as second
1800      operand.  */
1801
1802   if (rhs_class == GIMPLE_UNARY_RHS
1803       || (rhs_class == GIMPLE_BINARY_RHS
1804           && TREE_CODE (rhs2) == INTEGER_CST))
1805     {
1806       if (code != BIT_AND_EXPR
1807           && code != LSHIFT_EXPR
1808           && code != RSHIFT_EXPR
1809           && code != LROTATE_EXPR
1810           && code != RROTATE_EXPR
1811           && code != NOP_EXPR
1812           && code != CONVERT_EXPR)
1813         return NULL_TREE;
1814
1815       source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
1816
1817       /* If find_bswap_or_nop_1 returned NULL or RHS1 is defined by a load,
1818          RHS1 is a leaf node and we have to initialize the symbolic number.  */
1819       if (!source_expr1 || gimple_assign_load_p (rhs1_stmt))
1820         {
1821           /* Set up the symbolic number N by setting each byte to a
1822              value between 1 and the byte size of rhs1.  The highest
1823              order byte is set to n->size and the lowest order
1824              byte to 1.  */
1825           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1826           if (n->size % BITS_PER_UNIT != 0)
1827             return NULL_TREE;
1828           n->size /= BITS_PER_UNIT;
1829           n->range = n->size;
1830           n->n = CMPNOP;
1831
1832           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1833             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1834                      (n->size * BITS_PER_UNIT)) - 1;
1835
1836           if (!source_expr1)
1837             {
1838               n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1839               source_expr1 = rhs1;
1840             }
1841         }
1842
1843       switch (code)
1844         {
1845         case BIT_AND_EXPR:
1846           {
1847             int i;
1848             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1849             unsigned HOST_WIDEST_INT tmp = val;
1850
1851             /* Only constants masking full bytes are allowed.  */
1852             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1853               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1854                 return NULL_TREE;
1855
1856             n->n &= val;
1857           }
1858           break;
1859         case LSHIFT_EXPR:
1860         case RSHIFT_EXPR:
1861         case LROTATE_EXPR:
1862         case RROTATE_EXPR:
1863           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1864             return NULL_TREE;
1865           break;
1866         CASE_CONVERT:
1867           {
1868             int type_size;
1869
1870             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1871             if (type_size % BITS_PER_UNIT != 0)
1872               return NULL_TREE;
1873
1874             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1875               {
1876                 /* If STMT casts to a smaller type mask out the bits not
1877                    belonging to the target type.  */
1878                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1879               }
1880             n->size = type_size / BITS_PER_UNIT;
1881             if (!n->base_addr)
1882               n->range = n->size;
1883           }
1884           break;
1885         default:
1886           return NULL_TREE;
1887         };
1888       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1889     }
1890
1891   /* Handle binary rhs.  */
1892
1893   if (rhs_class == GIMPLE_BINARY_RHS)
1894     {
1895       int i;
1896       struct symbolic_number n1, n2;
1897       unsigned HOST_WIDEST_INT mask;
1898       tree source_expr2;
1899
1900       if (code != BIT_IOR_EXPR)
1901         return NULL_TREE;
1902
1903       if (TREE_CODE (rhs2) != SSA_NAME)
1904         return NULL_TREE;
1905
1906       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1907
1908       switch (code)
1909         {
1910         case BIT_IOR_EXPR:
1911           source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
1912
1913           if (!source_expr1)
1914             return NULL_TREE;
1915
1916           source_expr2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
1917           /* N2 is only initialized if the second operand matched: test that first.  */
1918           if (!source_expr2 || n1.size != n2.size)
1919             return NULL_TREE;
1920
1921           if (!n1.vuse != !n2.vuse
1922               || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
1923             return NULL_TREE;
1924
1925           if (source_expr1 != source_expr2)
1926             {
1927               unsigned HOST_WIDEST_INT inc;
1928               unsigned byte;
1929               HOST_WIDE_INT off_sub;
1930               struct symbolic_number *n_ptr;
1931
1932               if (!n1.base_addr || !n2.base_addr
1933                   || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
1934                 return NULL_TREE;
1935               if (!n1.offset != !n2.offset ||
1936                   (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
1937                 return NULL_TREE;
1938
1939               /* We swap n1 with n2 to have n1 < n2.  */
1940               if (n2.bytepos < n1.bytepos)
1941                 {
1942                   struct symbolic_number tmpn;
1943
1944                   tmpn = n2;
1945                   n2 = n1;
1946                   n1 = tmpn;
1947                   source_expr1 = source_expr2;
1948                 }
1949
1950               off_sub = n2.bytepos - n1.bytepos;
1951
1952               /* Check that the range of memory covered fits in the biggest int size.  */
1953               if (off_sub + n2.range > (int) sizeof (HOST_WIDEST_INT))
1954                 return NULL_TREE;
1955               n->range = n2.range + off_sub;
1956
1957               /* Reinterpret byte marks in symbolic number holding the value of
1958                  bigger weight according to host endianness.  INC and MASK are unsigned so that shifting them stays well defined.  */
1959               inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
1960               mask = 0xFF;
1961               if (BYTES_BIG_ENDIAN)
1962                 n_ptr = &n1;
1963               else
1964                 n_ptr = &n2;
1965               for (byte = 0; byte < sizeof (HOST_WIDEST_INT); byte++, inc <<= 8,
1966                    mask <<= 8)
1967                 {
1968                   if (n_ptr->n & mask)
1969                     n_ptr->n += inc;
1970                 }
1971             }
1972           else
1973             n->range = n1.range;
1974
1975           if (!n1.alias_set
1976               || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
1977             n->alias_set = n1.alias_set;
1978           else
1979             n->alias_set = ptr_type_node;
1980           n->vuse = n1.vuse;
1981           n->base_addr = n1.base_addr;
1982           n->offset = n1.offset;
1983           n->bytepos = n1.bytepos;
1984           n->size = n1.size;
1985           for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
1986             {
1987               unsigned HOST_WIDEST_INT masked1, masked2;
1988               /* A byte set in both operands must come from the same source byte.  */
1989               masked1 = n1.n & mask;
1990               masked2 = n2.n & mask;
1991               if (masked1 && masked2 && masked1 != masked2)
1992                 return NULL_TREE;
1993             }
1994           n->n = n1.n | n2.n;
1995
1996           if (!verify_symbolic_number_p (n, stmt))
1997             return NULL_TREE;
1998
1999           break;
2000         default:
2001           return NULL_TREE;
2002         }
2003       return source_expr1;
2004     }
2005   return NULL_TREE;
2006 }
2007
2008 /* Check if STMT completes a bswap implementation or a read in a given
2009    endianness consisting of ORs, SHIFTs and ANDs and set *BSWAP
2010    accordingly.  Also set N to represent the kind of operations
2011    performed: size in bits of the resulting expression (N->range) and
2012    whether it works on a memory source, and if so alias-set and vuse.
2013    Return the source tree expression, or NULL_TREE if nothing was matched.  */
2014
2015 static tree
2016 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2017 {
2018   /* The numbers which the find_bswap_or_nop_1 result should match for a
2019      full byte swap and for a nop, adjusted below to the range in use.  */
2020   unsigned HOST_WIDEST_INT cmpxchg = CMPXCHG;
2021   unsigned HOST_WIDEST_INT cmpnop = CMPNOP;
2022   tree source_expr;
2023   int limit;
2024
2025   /* The last parameter determines the depth search limit.  It usually correlates
2026      directly to the number n of bytes to be touched.  We increase that number by
2027      log2(n) + 1 here in order to also cover signed -> unsigned conversions of the
2028      src operand as can be seen in libgcc, and for initial shift/and operation of the src operand.  */
2029   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2030   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2031   source_expr = find_bswap_or_nop_1 (stmt, n, limit);
2032
2033   if (!source_expr)
2034     return NULL_TREE;
2035
2036   /* Find real size of result (highest non zero byte).  */
2037   if (n->base_addr)
2038     {
2039       int rsize;
2040       unsigned HOST_WIDEST_INT tmpn;
2041
2042       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
2043       /* No byte left: nothing to match, and CMPXCHG below would be shifted by its full width.  */
2044       if (!rsize)
2045         return NULL_TREE;
2046       n->range = rsize;
2047     }
2048
2049   /* Restrict CMP* to the bytes in range.  */
2050   if (n->range < (int)sizeof (HOST_WIDEST_INT))
2051     {
2052       unsigned HOST_WIDEST_INT mask;
2053
2054       mask = ((unsigned HOST_WIDEST_INT)1 << (n->range * BITS_PER_UNIT)) - 1;
2055       cmpxchg >>= (sizeof (HOST_WIDEST_INT) - n->range) * BITS_PER_UNIT;
2056       cmpnop &= mask;
2057     }
2058
2059   /* A complete byte swap should make the symbolic number to start with
2060      the largest digit in the highest order byte. Unchanged symbolic
2061      number indicates a read with same endianness as host architecture.  */
2062   if (n->n == cmpnop)
2063     *bswap = false;
2064   else if (n->n == cmpxchg)
2065     *bswap = true;
2066   else
2067     return NULL_TREE;
2068
2069   /* Useless bit manipulation performed by code.  */
2070   if (!n->base_addr && n->n == cmpnop)
2071     return NULL_TREE;
2072
2073   n->range *= BITS_PER_UNIT;
2074   return source_expr;
2075 }
2076
2077 namespace {
2078 /* Static description of the "bswap" pass: a GIMPLE pass requiring SSA form.  */
2079 const pass_data pass_data_optimize_bswap =
2080 {
2081   GIMPLE_PASS, /* type */
2082   "bswap", /* name */
2083   OPTGROUP_NONE, /* optinfo_flags */
2084   true, /* has_execute */
2085   TV_NONE, /* tv_id */
2086   PROP_ssa, /* properties_required */
2087   0, /* properties_provided */
2088   0, /* properties_destroyed */
2089   0, /* todo_flags_start */
2090   0, /* todo_flags_finish */
2091 };
2092
2093 class pass_optimize_bswap : public gimple_opt_pass
2094 {
2095 public:
2096   /* Build the pass for compiler context CTXT from pass_data_optimize_bswap.  */
2097   pass_optimize_bswap (gcc::context *ctxt)
2098     : gimple_opt_pass (pass_data_optimize_bswap, ctxt) {}
2099
2100   /* Run only when optimizing with expensive optimizations enabled.  */
2101   virtual bool gate (function *)
2102     {
2103       return optimize && flag_expensive_optimizations;
2104     }
2105
2106   /* Defined out of line below.  */
2107   virtual unsigned int execute (function *);
2108 }; // class pass_optimize_bswap
2109
2110 /* Perform the bswap optimization: replace the statement STMT at GSI by a
2111    load of type LOAD_TYPE, with the VUSE and alias pointer type recorded in N,
2112    if a memory source is involved (N->base_addr is non null), followed by a
2113    call to the bswap builtin FNDECL, which operates on BSWAP_TYPE, if BSWAP is true.
2114    SRC gives the source on which STMT is operating and N->range its size in bits (16, 32 or 64).
2115    Return false, without changing anything, if a byte swapped load would be a slow unaligned access, true otherwise.  */
2116
2117 static bool
2118 bswap_replace (gimple stmt, gimple_stmt_iterator *gsi, tree src, tree fndecl,
2119                tree bswap_type, tree load_type, struct symbolic_number *n,
2120                bool bswap)
2121 {
2122   tree tmp, tgt;
2123   gimple call;
2124
2125   tgt = gimple_assign_lhs (stmt);
2126
2127   /* Need to load the value from memory first.  */
2128   if (n->base_addr)
2129     {
2130       tree addr_expr, addr_tmp, val_expr, val_tmp;
2131       tree load_offset_ptr, aligned_load_type;
2132       gimple addr_stmt, load_stmt;
2133       unsigned align;
2134
2135       align = get_object_alignment (src);
2136       if (bswap && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2137         return false;	/* Nothing has been inserted or modified yet.  */
2138
2139       /*  Compute address to load from and cast according to the size
2140           of the load.  */
2141       addr_expr = build_fold_addr_expr (unshare_expr (src));
2142       if (is_gimple_min_invariant (addr_expr))
2143         addr_tmp = addr_expr;
2144       else
2145         {
2146           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2147                                          "load_src");
2148           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2149           gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT);
2150         }
2151
2152       /* Perform the load, through a less aligned variant of LOAD_TYPE if SRC is not aligned enough for it.  */
2153       aligned_load_type = load_type;
2154       if (align < TYPE_ALIGN (load_type))
2155         aligned_load_type = build_aligned_type (load_type, align);
2156       load_offset_ptr = build_int_cst (n->alias_set, 0);
2157       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2158                               load_offset_ptr);
2159       /* Without a byte swap STMT itself becomes the load, or a conversion of a separate load.  */
2160       if (!bswap)
2161         {
2162           if (n->range == 16)
2163             nop_stats.found_16bit++;
2164           else if (n->range == 32)
2165             nop_stats.found_32bit++;
2166           else
2167             {
2168               gcc_assert (n->range == 64);
2169               nop_stats.found_64bit++;
2170             }
2171
2172           /* Convert the result of load if necessary.  */
2173           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2174             {
2175               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2176                                             "load_dst");
2177               load_stmt = gimple_build_assign (val_tmp, val_expr);
2178               gimple_set_vuse (load_stmt, n->vuse);
2179               gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT);
2180               gimple_assign_set_rhs_with_ops_1 (gsi, NOP_EXPR, val_tmp,
2181                                                 NULL_TREE, NULL_TREE);
2182             }
2183           else
2184             gimple_assign_set_rhs_with_ops_1 (gsi, MEM_REF, val_expr,
2185                                               NULL_TREE, NULL_TREE);
2186           update_stmt (gsi_stmt (*gsi));
2187
2188           if (dump_file)
2189             {
2190               fprintf (dump_file,
2191                        "%d bit load in host endianness found at: ",
2192                        (int)n->range);
2193               print_gimple_stmt (dump_file, stmt, 0, 0);
2194             }
2195           return true;
2196         }
2197       else
2198         {
2199           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2200           load_stmt = gimple_build_assign (val_tmp, val_expr);
2201           gimple_set_vuse (load_stmt, n->vuse);
2202           gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT);
2203         }
2204       src = val_tmp;	/* The bswap call below operates on the loaded value.  */
2205     }
2206
2207   if (n->range == 16)
2208     bswap_stats.found_16bit++;
2209   else if (n->range == 32)
2210     bswap_stats.found_32bit++;
2211   else
2212     {
2213       gcc_assert (n->range == 64);
2214       bswap_stats.found_64bit++;
2215     }
2216
2217   tmp = src;
2218
2219   /* Convert the src expression if necessary.  */
2220   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2221     {
2222       gimple convert_stmt;
2223       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2224       convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src, NULL);
2225       gsi_insert_before (gsi, convert_stmt, GSI_SAME_STMT);
2226     }
2227
2228   call = gimple_build_call (fndecl, 1, tmp);
2229
2230   tmp = tgt;
2231
2232   /* Convert the result if necessary.  */
2233   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2234     {
2235       gimple convert_stmt;
2236       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2237       convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL);
2238       gsi_insert_after (gsi, convert_stmt, GSI_SAME_STMT);
2239     }
2240
2241   gimple_call_set_lhs (call, tmp);
2242
2243   if (dump_file)
2244     {
2245       fprintf (dump_file, "%d bit bswap implementation found at: ",
2246                (int)n->range);
2247       print_gimple_stmt (dump_file, stmt, 0, 0);
2248     }
2249   /* Insert the call after STMT, then remove STMT, whose lhs is now set by the call or the conversion.  */
2250   gsi_insert_after (gsi, call, GSI_SAME_STMT);
2251   gsi_remove (gsi, true);
2252   return true;
2253 }
2254
2255 /* Find manual byte swap implementations as well as loads in a given
2256    endianness in FUN.  Byte swaps are turned into a bswap builtin invocation
2257    while endian loads are converted to a bswap builtin invocation or a
2258    simple load according to the host endianness.  Return TODO_update_ssa if a statement was replaced, 0 otherwise.  */
2259
2260 unsigned int
2261 pass_optimize_bswap::execute (function *fun)
2262 {
2263   basic_block bb;
2264   bool bswap16_p, bswap32_p, bswap64_p;
2265   bool changed = false;
2266   tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2267
2268   if (BITS_PER_UNIT != 8)
2269     return 0;
2270   /* CMPNOP and CMPXCHG are only usable with at least 8 bytes in HOST_WIDEST_INT.  */
2271   if (sizeof (HOST_WIDEST_INT) < 8)
2272     return 0;
2273
2274   bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
2275                && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
2276   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2277                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2278   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2279                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2280                    || (bswap32_p && word_mode == SImode)));
2281
2282   /* Determine the argument type of the builtins.  The code later on
2283      assumes that the return and argument type are the same.  */
2284   if (bswap16_p)
2285     {
2286       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2287       bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2288     }
2289
2290   if (bswap32_p)
2291     {
2292       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2293       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2294     }
2295
2296   if (bswap64_p)
2297     {
2298       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2299       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2300     }
2301
2302   memset (&nop_stats, 0, sizeof (nop_stats));
2303   memset (&bswap_stats, 0, sizeof (bswap_stats));
2304
2305   FOR_EACH_BB_FN (bb, fun)
2306     {
2307       gimple_stmt_iterator gsi;
2308
2309       /* We do a reverse scan for bswap patterns to make sure we get the
2310          widest match. As bswap pattern matching doesn't handle
2311          previously inserted smaller bswap replacements as sub-
2312          patterns, the wider variant wouldn't be detected.  */
2313       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
2314         {
2315           gimple stmt = gsi_stmt (gsi);
2316           tree fndecl = NULL_TREE, bswap_type = NULL_TREE;
2317           tree src, load_type;
2318           struct symbolic_number n;
2319           bool bswap;
2320           /* Every pattern looked for ends with a bitwise OR.  */
2321           if (!is_gimple_assign (stmt)
2322               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
2323             continue;
2324
2325           src = find_bswap_or_nop (stmt, &n, &bswap);
2326
2327           if (!src)
2328             continue;
2329           /* N.range is in bits here; only 16, 32 and 64 bit wide results are replaced.  */
2330           switch (n.range)
2331             {
2332             case 16:
2333               load_type = uint16_type_node;
2334               if (bswap16_p)
2335                 {
2336                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2337                   bswap_type = bswap16_type;
2338                 }
2339               break;
2340             case 32:
2341               load_type = uint32_type_node;
2342               if (bswap32_p)
2343                 {
2344                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2345                   bswap_type = bswap32_type;
2346                 }
2347               break;
2348             case 64:
2349               load_type = uint64_type_node;
2350               if (bswap64_p)
2351                 {
2352                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2353                   bswap_type = bswap64_type;
2354                 }
2355               break;
2356             default:
2357               continue;
2358             }
2359           /* A byte swap needs a usable builtin of that width; a host endian load does not.  */
2360           if (bswap && !fndecl)
2361             continue;
2362
2363           if (bswap_replace (stmt, &gsi, src, fndecl, bswap_type, load_type,
2364                              &n, bswap))
2365             changed = true;
2366         }
2367     }
2368
2369   statistics_counter_event (fun, "16-bit nop implementations found",
2370                             nop_stats.found_16bit);
2371   statistics_counter_event (fun, "32-bit nop implementations found",
2372                             nop_stats.found_32bit);
2373   statistics_counter_event (fun, "64-bit nop implementations found",
2374                             nop_stats.found_64bit);
2375   statistics_counter_event (fun, "16-bit bswap implementations found",
2376                             bswap_stats.found_16bit);
2377   statistics_counter_event (fun, "32-bit bswap implementations found",
2378                             bswap_stats.found_32bit);
2379   statistics_counter_event (fun, "64-bit bswap implementations found",
2380                             bswap_stats.found_64bit);
2381
2382   return (changed ? TODO_update_ssa : 0);
2383 }
2384
2385 } // anon namespace
2386 /* Allocate and return a new instance of the bswap pass for context CTXT.  */
2387 gimple_opt_pass *
2388 make_pass_optimize_bswap (gcc::context *ctxt)
2389 {
2390   return new pass_optimize_bswap (ctxt);
2391 }
2392
2393 /* Return true if STMT is a type conversion that can be looked through
2394    when its result feeds a widening multiplication of type RESULT_TYPE.  */
2395 static bool
2396 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2397 {
2398   enum tree_code conv_code = gimple_assign_rhs_code (stmt);
2399   tree to_type, from_type;
2400   bool sign_preserved, widens;
2401
2402   /* For a non-integer result only a fixed-point conversion can be
2403      stripped.  */
2404   if (TREE_CODE (result_type) != INTEGER_TYPE)
2405     return conv_code == FIXED_CONVERT_EXPR;
2406
2407   /* From here on the result is an integer: STMT has to be a plain
2408      conversion (CONVERT_EXPR_CODE_P) to be of any interest.  */
2409   if (!CONVERT_EXPR_CODE_P (conv_code))
2410     return false;
2411
2412   /* A conversion that already yields the precision of the result can
2413      always be stripped.  The multiply operation will be selected to
2414      create the correct extension as a by-product.  */
2415   to_type = TREE_TYPE (gimple_assign_lhs (stmt));
2416   if (TYPE_PRECISION (to_type) == TYPE_PRECISION (result_type))
2417     return true;
2418
2419   /* Any other conversion must be a genuine widening that does not
2420      change how the value is extended: an unsigned source may be
2421      widened to a type of either signedness, whereas a signed source
2422      must be widened to a signed type.  */
2423   from_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2424   widens = TYPE_PRECISION (to_type) > TYPE_PRECISION (from_type);
2425   sign_preserved = (TYPE_UNSIGNED (from_type)
2426                     || (TYPE_UNSIGNED (to_type)
2427                         == TYPE_UNSIGNED (from_type)));
2428
2429   if (!widens)
2430     return false;
2431   return sign_preserved;
2432 }
2433
2434 /* Return true if RHS is a suitable operand for a widening multiplication
2435    whose result has type TYPE.  There are two cases:
2436
2437      - RHS is an SSA name standing for a value of the same kind of type
2438        as TYPE but of at most half its precision, possibly after looking
2439        through a strippable conversion.  Store that narrow value in
2440        *NEW_RHS_OUT and its type in *TYPE_OUT.
2441
2442      - RHS is an integer constant, or a strippable conversion of one.
2443        Store the constant in *NEW_RHS_OUT and NULL in *TYPE_OUT.
2444
2445    Both outputs are left alone when false is returned.  */
2446
2447 static bool
2448 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2449                         tree *new_rhs_out)
2450 {
2451   gimple def_stmt;
2452   tree operand, operand_type;
2453
2454   /* A bare constant is reported without an operand type of its
2455      own.  */
2456   if (TREE_CODE (rhs) == INTEGER_CST)
2457     {
2458       *new_rhs_out = rhs;
2459       *type_out = NULL;
2460       return true;
2461     }
2462
2463   if (TREE_CODE (rhs) != SSA_NAME)
2464     return false;
2465
2466   /* Use the source of the defining statement when that statement is a
2467      conversion the multiplication can absorb, RHS itself otherwise.  */
2468   operand = rhs;
2469   def_stmt = SSA_NAME_DEF_STMT (rhs);
2470   if (is_gimple_assign (def_stmt)
2471       && widening_mult_conversion_strippable_p (type, def_stmt))
2472     {
2473       operand = gimple_assign_rhs1 (def_stmt);
2474       if (TREE_CODE (operand) == INTEGER_CST)
2475         {
2476           *new_rhs_out = operand;
2477           *type_out = NULL;
2478           return true;
2479         }
2480     }
2481
2482   /* The operand must be of the same kind of type as the result and
2483      leave room for doubling its precision.  */
2484   operand_type = TREE_TYPE (operand);
2485   if (TREE_CODE (operand_type) != TREE_CODE (type)
2486       || TYPE_PRECISION (operand_type) * 2 > TYPE_PRECISION (type))
2487     return false;
2488
2489   *new_rhs_out = operand;
2490   *type_out = operand_type;
2491   return true;
2492 }
2493
2494 /* Return true if STMT, whose result type is that of its lhs, performs a
2495    widening multiplication.  If so, store the unwidened types of the
2496    operands in *TYPE1_OUT and *TYPE2_OUT and the unwidened operands
2497    themselves in *RHS1_OUT and *RHS2_OUT, ordered so that the first type is
2498    at least as precise as the second.  */
2499
2500 static bool
2501 is_widening_mult_p (gimple stmt,
2502                     tree *type1_out, tree *rhs1_out,
2503                     tree *type2_out, tree *rhs2_out)
2504 {
2505   tree result_type = TREE_TYPE (gimple_assign_lhs (stmt));
2506   enum tree_code kind = TREE_CODE (result_type);
2507
2508   if (kind != INTEGER_TYPE && kind != FIXED_POINT_TYPE)
2509     return false;
2510
2511   /* Both operands have to qualify; the first one is examined first.  */
2512   if (!is_widening_mult_rhs_p (result_type, gimple_assign_rhs1 (stmt),
2513                                type1_out, rhs1_out)
2514       || !is_widening_mult_rhs_p (result_type, gimple_assign_rhs2 (stmt),
2515                                   type2_out, rhs2_out))
2516     return false;
2517
2518   /* A constant operand comes back with a NULL type.  Give it the type of the
2519      other operand if its value fits there; two constants are rejected.  */
2520   if (*type1_out == NULL)
2521     {
2522       if (*type2_out == NULL)
2523         return false;
2524       if (!int_fits_type_p (*rhs1_out, *type2_out))
2525         return false;
2526       *type1_out = *type2_out;
2527     }
2528   else if (*type2_out == NULL)
2529     {
2530       if (!int_fits_type_p (*rhs2_out, *type1_out))
2531         return false;
2532       *type2_out = *type1_out;
2533     }
2534
2535   /* Ensure that the larger of the two operands comes first.  */
2536   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2537     {
2538       tree wider_type = *type2_out;
2539       tree wider_rhs = *rhs2_out;
2540       *type2_out = *type1_out;
2541       *rhs2_out = *rhs1_out;
2542       *type1_out = wider_type;
2543       *rhs1_out = wider_rhs;
2544     }
2545   return true;
2546 }
2547
2548 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2549    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2550    value is true iff we converted the statement.

        GSI is only passed on to build_and_insert_cast when an operand has
        to be converted (presumably as the insertion point -- confirm
        against that helper); the caller in this file passes the iterator
        that currently yields STMT.

        False is returned, before any statement is inserted or changed,
        when the lhs is not an INTEGER_TYPE, when is_widening_mult_p
        rejects STMT, when no widening-multiply handler is found, or when
        twice the handler's input precision exceeds the precision of the
        result type.  On success STMT's operands and rhs code are replaced
        in place and widen_mul_stats.widen_mults_inserted is incremented.  */
2551
2552 static bool
2553 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2554 {
2555   tree lhs, rhs1, rhs2, type, type1, type2;
2556   enum insn_code handler;
2557   enum machine_mode to_mode, from_mode, actual_mode;
2558   optab op;
2559   int actual_precision;
2560   location_t loc = gimple_location (stmt);
2561   bool from_unsigned1, from_unsigned2;
2562
2563   lhs = gimple_assign_lhs (stmt);
2564   type = TREE_TYPE (lhs);
2565   if (TREE_CODE (type) != INTEGER_TYPE)
2566     return false;
2567
       /* On success type1/rhs1 describe the operand with the greater (or
          equal) precision, type2/rhs2 the other one.  */
2568   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2569     return false;
2570
2571   to_mode = TYPE_MODE (type);
2572   from_mode = TYPE_MODE (type1);
2573   from_unsigned1 = TYPE_UNSIGNED (type1);
2574   from_unsigned2 = TYPE_UNSIGNED (type2);
2575
       /* Choose the optab from the operands' signedness: both unsigned,
          both signed, or mixed.  */
2576   if (from_unsigned1 && from_unsigned2)
2577     op = umul_widen_optab;
2578   else if (!from_unsigned1 && !from_unsigned2)
2579     op = smul_widen_optab;
2580   else
2581     op = usmul_widen_optab;
2582
       /* The lookup also fills in ACTUAL_MODE, which is used further down
          as the mode the handler's inputs must have.  */
2583   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2584                                                   0, &actual_mode);
2585
2586   if (handler == CODE_FOR_nothing)
2587     {
           /* No direct handler.  For the unsigned and the mixed-sign case
              retry with a signed widening multiply; for the signed case
              there is nothing left to try.  */
2588       if (op != smul_widen_optab)
2589         {
2590           /* We can use a signed multiply with unsigned types as long as
2591              there is a wider mode to use, or it is the smaller of the two
2592              types that is unsigned.  Note that type1 >= type2, always.  */
2593           if ((TYPE_UNSIGNED (type1)
2594                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2595               || (TYPE_UNSIGNED (type2)
2596                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2597             {
                   /* An unsigned operand fills FROM_MODE completely, so a
                      wider input mode is needed to treat it as signed;
                      give up unless that mode is still smaller than the
                      result mode.  */
2598               from_mode = GET_MODE_WIDER_MODE (from_mode);
2599               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2600                 return false;
2601             }
2602
2603           op = smul_widen_optab;
2604           handler = find_widening_optab_handler_and_mode (op, to_mode,
2605                                                           from_mode, 0,
2606                                                           &actual_mode);
2607
2608           if (handler == CODE_FOR_nothing)
2609             return false;
2610
               /* Both inputs are fed to a signed multiply from here on; the
                  signedness comparison below then forces a cast for any
                  operand whose type is unsigned.  */
2611           from_unsigned1 = from_unsigned2 = false;
2612         }
2613       else
2614         return false;
2615     }
2616
2617   /* Ensure that the inputs to the handler are in the correct precision
2618      for the opcode.  This will be the full mode size.  */
2619   actual_precision = GET_MODE_PRECISION (actual_mode);
       /* The result type must be at least twice as precise as the inputs.  */
2620   if (2 * actual_precision > TYPE_PRECISION (type))
2621     return false;
2622   if (actual_precision != TYPE_PRECISION (type1)
2623       || from_unsigned1 != TYPE_UNSIGNED (type1))
2624     rhs1 = build_and_insert_cast (gsi, loc,
2625                                   build_nonstandard_integer_type
2626                                     (actual_precision, from_unsigned1), rhs1);
2627   if (actual_precision != TYPE_PRECISION (type2)
2628       || from_unsigned2 != TYPE_UNSIGNED (type2))
2629     rhs2 = build_and_insert_cast (gsi, loc,
2630                                   build_nonstandard_integer_type
2631                                     (actual_precision, from_unsigned2), rhs2);
2632
2633   /* Handle constants.  */
2634   if (TREE_CODE (rhs1) == INTEGER_CST)
2635     rhs1 = fold_convert (type1, rhs1);
2636   if (TREE_CODE (rhs2) == INTEGER_CST)
2637     rhs2 = fold_convert (type2, rhs2);
2638
       /* Rewrite STMT in place as a widening multiply of the two operands.  */
2639   gimple_assign_set_rhs1 (stmt, rhs1);
2640   gimple_assign_set_rhs2 (stmt, rhs2);
2641   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2642   update_stmt (stmt);
2643   widen_mul_stats.widen_mults_inserted++;
2644   return true;
2645 }
2646
2647 /* Process a single gimple statement STMT, which is found at the
2648    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2649    rhs (given by CODE), and try to convert it into a
2650    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2651    is true iff we converted the statement.

        The multiplication must define one of STMT's operands, either
        directly or through exactly one conversion, and its result must
        have a single use.  For a MINUS_EXPR only the second operand is
        considered as the multiplication.  Every `return false' happens
        before any statement is inserted or changed.  On success the
        statement at GSI is replaced and widen_mul_stats.maccs_inserted is
        incremented.  */
2652
2653 static bool
2654 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2655                             enum tree_code code)
2656 {
2657   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2658   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2659   tree type, type1, type2, optype;
2660   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2661   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2662   optab this_optab;
2663   enum tree_code wmult_code;
2664   enum insn_code handler;
2665   enum machine_mode to_mode, from_mode, actual_mode;
2666   location_t loc = gimple_location (stmt);
2667   int actual_precision;
2668   bool from_unsigned1, from_unsigned2;
2669
2670   lhs = gimple_assign_lhs (stmt);
2671   type = TREE_TYPE (lhs);
2672   if (TREE_CODE (type) != INTEGER_TYPE
2673       && TREE_CODE (type) != FIXED_POINT_TYPE)
2674     return false;
2675
2676   if (code == MINUS_EXPR)
2677     wmult_code = WIDEN_MULT_MINUS_EXPR;
2678   else
2679     wmult_code = WIDEN_MULT_PLUS_EXPR;
2680
2681   rhs1 = gimple_assign_rhs1 (stmt);
2682   rhs2 = gimple_assign_rhs2 (stmt);
2683
       /* Record the defining statement and rhs code of each SSA operand.
          The codes stay ERROR_MARK for operands that are not SSA names or
          are not defined by an assignment.  */
2684   if (TREE_CODE (rhs1) == SSA_NAME)
2685     {
2686       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2687       if (is_gimple_assign (rhs1_stmt))
2688         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2689     }
2690
2691   if (TREE_CODE (rhs2) == SSA_NAME)
2692     {
2693       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2694       if (is_gimple_assign (rhs2_stmt))
2695         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2696     }
2697
2698   /* Allow for one conversion statement between the multiply
2699      and addition/subtraction statement.  If there is more than
2700      one conversion then we assume they would invalidate this
2701      transformation.  If that's not the case then they should have
2702      been folded before now.  */
       /* After looking through a conversion, rhsN names the conversion's
          operand and rhsN_stmt its definition; convN_stmt remembers the
          conversion itself.  */
2703   if (CONVERT_EXPR_CODE_P (rhs1_code))
2704     {
2705       conv1_stmt = rhs1_stmt;
2706       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2707       if (TREE_CODE (rhs1) == SSA_NAME)
2708         {
2709           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2710           if (is_gimple_assign (rhs1_stmt))
2711             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2712         }
2713       else
2714         return false;
2715     }
2716   if (CONVERT_EXPR_CODE_P (rhs2_code))
2717     {
2718       conv2_stmt = rhs2_stmt;
2719       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2720       if (TREE_CODE (rhs2) == SSA_NAME)
2721         {
2722           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2723           if (is_gimple_assign (rhs2_stmt))
2724             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2725         }
2726       else
2727         return false;
2728     }
2729
2730   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2731      is_widening_mult_p, but we still need the rhs returns.
2732
2733      It might also appear that it would be sufficient to use the existing
2734      operands of the widening multiply, but that would limit the choice of
2735      multiply-and-accumulate instructions.
2736
2737      If the widened-multiplication result has more than one use, it is
2738      probably wiser not to do the conversion.  */
       /* The first operand is tried as the multiplication only for
          PLUS_EXPR; the second operand is tried for both codes.  */
2739   if (code == PLUS_EXPR
2740       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2741     {
2742       if (!has_single_use (rhs1)
2743           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2744                                   &type2, &mult_rhs2))
2745         return false;
2746       add_rhs = rhs2;
2747       conv_stmt = conv1_stmt;
2748     }
2749   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2750     {
2751       if (!has_single_use (rhs2)
2752           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2753                                   &type2, &mult_rhs2))
2754         return false;
2755       add_rhs = rhs1;
2756       conv_stmt = conv2_stmt;
2757     }
2758   else
2759     return false;
2760
2761   to_mode = TYPE_MODE (type);
2762   from_mode = TYPE_MODE (type1);
2763   from_unsigned1 = TYPE_UNSIGNED (type1);
2764   from_unsigned2 = TYPE_UNSIGNED (type2);
2765   optype = type1;
2766
2767   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2768   if (from_unsigned1 != from_unsigned2)
2769     {
2770       if (!INTEGRAL_TYPE_P (type))
2771         return false;
2772       /* We can use a signed multiply with unsigned types as long as
2773          there is a wider mode to use, or it is the smaller of the two
2774          types that is unsigned.  Note that type1 >= type2, always.  */
2775       if ((from_unsigned1
2776            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2777           || (from_unsigned2
2778               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2779         {
2780           from_mode = GET_MODE_WIDER_MODE (from_mode);
2781           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2782             return false;
2783         }
2784
           /* Treat both inputs as signed and look the optab up with a
              signed integer type as wide as FROM_MODE.  */
2785       from_unsigned1 = from_unsigned2 = false;
2786       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2787                                                false);
2788     }
2789
2790   /* If there was a conversion between the multiply and addition
2791      then we need to make sure it fits a multiply-and-accumulate.
2792      There should be a single mode change which does not change the
2793      value.  */
2794   if (conv_stmt)
2795     {
2796       /* We use the original, unmodified data types for this.  */
2797       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2798       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
           /* DATA_SIZE is the sum of the two unwidened operand precisions.
              The product counts as unsigned only if both operands are.  */
2799       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2800       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2801
2802       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2803         {
2804           /* Conversion is a truncate.  */
2805           if (TYPE_PRECISION (to_type) < data_size)
2806             return false;
2807         }
2808       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2809         {
2810           /* Conversion is an extend.  Check it's the right sort.  */
2811           if (TYPE_UNSIGNED (from_type) != is_unsigned
2812               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2813             return false;
2814         }
2815       /* else convert is a no-op for our purposes.  */
2816     }
2817
2818   /* Verify that the machine can perform a widening multiply
2819      accumulate in this mode/signedness combination, otherwise
2820      this transformation is likely to pessimize code.  */
2821   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2822   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2823                                                   from_mode, 0, &actual_mode);
2824
2825   if (handler == CODE_FOR_nothing)
2826     return false;
2827
2828   /* Ensure that the inputs to the handler are in the correct precision
2829      for the opcode.  This will be the full mode size.  */
2830   actual_precision = GET_MODE_PRECISION (actual_mode);
2831   if (actual_precision != TYPE_PRECISION (type1)
2832       || from_unsigned1 != TYPE_UNSIGNED (type1))
2833     mult_rhs1 = build_and_insert_cast (gsi, loc,
2834                                        build_nonstandard_integer_type
2835                                          (actual_precision, from_unsigned1),
2836                                        mult_rhs1);
2837   if (actual_precision != TYPE_PRECISION (type2)
2838       || from_unsigned2 != TYPE_UNSIGNED (type2))
2839     mult_rhs2 = build_and_insert_cast (gsi, loc,
2840                                        build_nonstandard_integer_type
2841                                          (actual_precision, from_unsigned2),
2842                                        mult_rhs2);
2843
       /* ADD_RHS may be the operand of a conversion that was looked
          through above, in which case its type no longer matches TYPE and
          a cast to TYPE is inserted.  */
2844   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2845     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2846
2847   /* Handle constants.  */
2848   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2849     mult_rhs1 = fold_convert (type1, mult_rhs1);
2850   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2851     mult_rhs2 = fold_convert (type2, mult_rhs2);
2852
       /* Replace the rhs of the statement at GSI with the three-operand
          widening multiply-accumulate and refresh that statement.  */
2853   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2854                                     add_rhs);
2855   update_stmt (gsi_stmt (*gsi));
2856   widen_mul_stats.maccs_inserted++;
2857   return true;
2858 }
2859
2860 /* Combine the multiplication at MUL_STMT with operands OP1 and OP2
2861    with uses in additions and subtractions to form fused multiply-add
2862    operations.  Returns true if successful and MUL_STMT should be removed.

        MUL_STMT's result is obtained with gimple_get_lhs; the caller in
        this file passes either a MULT_EXPR assignment or a pow (x, 2.0)
        call with OP1 == OP2 == x.  The work is done in two passes over
        the immediate uses of that result: the first only checks that
        every non-debug use can become an FMA and returns false without
        changing anything otherwise; the second rewrites each use.
        MUL_STMT itself is not removed here.  */
2863
2864 static bool
2865 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2866 {
2867   tree mul_result = gimple_get_lhs (mul_stmt);
2868   tree type = TREE_TYPE (mul_result);
2869   gimple use_stmt, neguse_stmt, fma_stmt;
2870   use_operand_p use_p;
2871   imm_use_iterator imm_iter;
2872
       /* Floating-point contraction is not allowed when it is switched off.  */
2873   if (FLOAT_TYPE_P (type)
2874       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2875     return false;
2876
2877   /* We don't want to do bitfield reduction ops.  */
2878   if (INTEGRAL_TYPE_P (type)
2879       && (TYPE_PRECISION (type)
2880           != GET_MODE_PRECISION (TYPE_MODE (type))))
2881     return false;
2882
2883   /* If the target doesn't support it, don't generate it.  We assume that
2884      if fma isn't available then fms, fnma or fnms are not either.  */
2885   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2886     return false;
2887
2888   /* If the multiplication has zero uses, it is kept around probably because
2889      of -fnon-call-exceptions.  Don't optimize it away in that case,
2890      it is DCE job.  */
2891   if (has_zero_uses (mul_result))
2892     return false;
2893
2894   /* Make sure that the multiplication statement becomes dead after
2895      the transformation, thus that all uses are transformed to FMAs.
2896      This means we assume that an FMA operation has the same cost
2897      as an addition.  */
2898   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2899     {
2900       enum tree_code use_code;
           /* RESULT is the value the final PLUS/MINUS consumes: the product
              itself, or the lhs of an intervening negate.  */
2901       tree result = mul_result;
2902       bool negate_p = false;
2903
2904       use_stmt = USE_STMT (use_p);
2905
2906       if (is_gimple_debug (use_stmt))
2907         continue;
2908
2909       /* For now restrict these operations to single basic blocks.  In theory
2910          we would want to support sinking the multiplication in
2911          m = a*b;
2912          if ()
2913            ma = m + c;
2914          else
2915            d = m;
2916          to form a fma in the then block and sink the multiplication to the
2917          else block.  */
2918       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2919         return false;
2920
2921       if (!is_gimple_assign (use_stmt))
2922         return false;
2923
2924       use_code = gimple_assign_rhs_code (use_stmt);
2925
2926       /* A negate on the multiplication leads to FNMA.  */
2927       if (use_code == NEGATE_EXPR)
2928         {
2929           ssa_op_iter iter;
2930           use_operand_p usep;
2931
2932           result = gimple_assign_lhs (use_stmt);
2933
2934           /* Make sure the negate statement becomes dead with this
2935              single transformation.  */
               /* NOTE(review): this stores into USE_P, the variable the
                  enclosing FOR_EACH_IMM_USE_FAST loop also assigns;
                  presumably harmless because the loop position is kept in
                  IMM_ITER -- confirm.  */
2936           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2937                                &use_p, &neguse_stmt))
2938             return false;
2939
2940           /* Make sure the multiplication isn't also used on that stmt.  */
2941           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2942             if (USE_FROM_PTR (usep) == mul_result)
2943               return false;
2944
2945           /* Re-validate.  */
2946           use_stmt = neguse_stmt;
2947           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2948             return false;
2949           if (!is_gimple_assign (use_stmt))
2950             return false;
2951
2952           use_code = gimple_assign_rhs_code (use_stmt);
2953           negate_p = true;
2954         }
2955
2956       switch (use_code)
2957         {
2958         case MINUS_EXPR:
               /* Being the subtrahend flips the sign of the product.  */
2959           if (gimple_assign_rhs2 (use_stmt) == result)
2960             negate_p = !negate_p;
2961           break;
2962         case PLUS_EXPR:
2963           break;
2964         default:
2965           /* FMA can only be formed from PLUS and MINUS.  */
2966           return false;
2967         }
2968
2969       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
2970          by a MULT_EXPR that we'll visit later, we might be able to
2971          get a more profitable match with fnma.
2972          OTOH, if we don't, a negate / fma pair has likely lower latency
2973          than a mult / subtract pair.  */
2974       if (use_code == MINUS_EXPR && !negate_p
2975           && gimple_assign_rhs1 (use_stmt) == result
2976           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
2977           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
2978         {
2979           tree rhs2 = gimple_assign_rhs2 (use_stmt);
2980
2981           if (TREE_CODE (rhs2) == SSA_NAME)
2982             {
2983               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
2984               if (has_single_use (rhs2)
2985                   && is_gimple_assign (stmt2)
2986                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
2987               return false;
2988             }
2989         }
2990
2991       /* We can't handle a * b + a * b.  */
2992       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2993         return false;
2994
2995       /* While it is possible to validate whether or not the exact form
2996          that we've recognized is available in the backend, the assumption
2997          is that the transformation is never a loss.  For instance, suppose
2998          the target only has the plain FMA pattern available.  Consider
2999          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3000          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3001          still have 3 operations, but in the FMA form the two NEGs are
3002          independent and could be run in parallel.  */
3003     }
3004
       /* Every use passed the checks above; now rewrite each of them into
          an FMA_EXPR.  */
3005   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3006     {
3007       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3008       enum tree_code use_code;
3009       tree addop, mulop1 = op1, result = mul_result;
3010       bool negate_p = false;
3011
3012       if (is_gimple_debug (use_stmt))
3013         continue;
3014
3015       use_code = gimple_assign_rhs_code (use_stmt);
3016       if (use_code == NEGATE_EXPR)
3017         {
               /* Drop the negate statement and carry on with its single
                  user, remembering that the product is negated.  */
3018           result = gimple_assign_lhs (use_stmt);
3019           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3020           gsi_remove (&gsi, true);
3021           release_defs (use_stmt);
3022
3023           use_stmt = neguse_stmt;
3024           gsi = gsi_for_stmt (use_stmt);
3025           use_code = gimple_assign_rhs_code (use_stmt);
3026           negate_p = true;
3027         }
3028
3029       if (gimple_assign_rhs1 (use_stmt) == result)
3030         {
3031           addop = gimple_assign_rhs2 (use_stmt);
3032           /* a * b - c -> a * b + (-c)  */
3033           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3034             addop = force_gimple_operand_gsi (&gsi,
3035                                               build1 (NEGATE_EXPR,
3036                                                       type, addop),
3037                                               true, NULL_TREE, true,
3038                                               GSI_SAME_STMT);
3039         }
3040       else
3041         {
3042           addop = gimple_assign_rhs1 (use_stmt);
3043           /* a - b * c -> (-b) * c + a */
3044           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3045             negate_p = !negate_p;
3046         }
3047
           /* A negated product is expressed by negating the first factor.  */
3048       if (negate_p)
3049         mulop1 = force_gimple_operand_gsi (&gsi,
3050                                            build1 (NEGATE_EXPR,
3051                                                    type, mulop1),
3052                                            true, NULL_TREE, true,
3053                                            GSI_SAME_STMT);
3054
           /* The FMA reuses the lhs of the statement it replaces.  */
3055       fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
3056                                                gimple_assign_lhs (use_stmt),
3057                                                mulop1, op2,
3058                                                addop);
3059       gsi_replace (&gsi, fma_stmt, true);
3060       widen_mul_stats.fmas_inserted++;
3061     }
3062
3063   return true;
3064 }
3065
3066 /* Find integer multiplications where the operands are extended from
3067    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3068    where appropriate.

        The same walk also tries to turn additions and subtractions into
        widening multiply-accumulates and to fuse multiplications (and
        pow (x, 2.0) calls) with their uses into FMA_EXPRs.  */
3069
3070 namespace {
3071
     /* Static description of the pass; the meaning of each field is given
        by the comment next to it.  */
3072 const pass_data pass_data_optimize_widening_mul =
3073 {
3074   GIMPLE_PASS, /* type */
3075   "widening_mul", /* name */
3076   OPTGROUP_NONE, /* optinfo_flags */
3077   true, /* has_execute */
3078   TV_NONE, /* tv_id */
3079   PROP_ssa, /* properties_required */
3080   0, /* properties_provided */
3081   0, /* properties_destroyed */
3082   0, /* todo_flags_start */
3083   TODO_update_ssa, /* todo_flags_finish */
3084 };
3085
3086 class pass_optimize_widening_mul : public gimple_opt_pass
3087 {
3088 public:
3089   pass_optimize_widening_mul (gcc::context *ctxt)
3090     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3091   {}
3092
3093   /* opt_pass methods: */
       /* The pass is enabled only when both flag_expensive_optimizations
          and optimize are nonzero.  */
3094   virtual bool gate (function *)
3095     {
3096       return flag_expensive_optimizations && optimize;
3097     }
3098
3099   virtual unsigned int execute (function *);
3100
3101 }; // class pass_optimize_widening_mul
3102
     /* Walk every statement after the labels of every basic block of FUN
        and apply the conversions above.  Returns TODO_cleanup_cfg if
        removing a statement made dead EH edges go away, 0 otherwise.  */
3103 unsigned int
3104 pass_optimize_widening_mul::execute (function *fun)
3105 {
3106   basic_block bb;
3107   bool cfg_changed = false;
3108
3109   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3110
3111   FOR_EACH_BB_FN (bb, fun)
3112     {
3113       gimple_stmt_iterator gsi;
3114
           /* The iterator is advanced at the bottom of the loop body; the
              paths that remove the current statement `continue' instead
              and so skip that gsi_next.  */
3115       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3116         {
3117           gimple stmt = gsi_stmt (gsi);
3118           enum tree_code code;
3119
3120           if (is_gimple_assign (stmt))
3121             {
3122               code = gimple_assign_rhs_code (stmt);
3123               switch (code)
3124                 {
3125                 case MULT_EXPR:
                       /* Prefer a widening multiply; only if that fails try
                          to fuse the product into its uses, in which case
                          the multiplication itself is deleted.  */
3126                   if (!convert_mult_to_widen (stmt, &gsi)
3127                       && convert_mult_to_fma (stmt,
3128                                               gimple_assign_rhs1 (stmt),
3129                                               gimple_assign_rhs2 (stmt)))
3130                     {
3131                       gsi_remove (&gsi, true);
3132                       release_defs (stmt);
3133                       continue;
3134                     }
3135                   break;
3136
3137                 case PLUS_EXPR:
3138                 case MINUS_EXPR:
                       /* The result is deliberately unused: the statement
                          stays where it is whether or not it was converted.  */
3139                   convert_plusminus_to_widen (&gsi, stmt, code);
3140                   break;
3141
3142                 default:;
3143                 }
3144             }
3145           else if (is_gimple_call (stmt)
3146                    && gimple_call_lhs (stmt))
3147             {
3148               tree fndecl = gimple_call_fndecl (stmt);
3149               if (fndecl
3150                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3151                 {
3152                   switch (DECL_FUNCTION_CODE (fndecl))
3153                     {
3154                       case BUILT_IN_POWF:
3155                       case BUILT_IN_POW:
3156                       case BUILT_IN_POWL:
                             /* A pow call whose second argument is the
                                constant 2 is treated as arg0 * arg0 for
                                the purpose of forming FMAs.  */
3157                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3158                             && REAL_VALUES_EQUAL
3159                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3160                                   dconst2)
3161                             && convert_mult_to_fma (stmt,
3162                                                     gimple_call_arg (stmt, 0),
3163                                                     gimple_call_arg (stmt, 0)))
3164                           {
                                 /* The call is gone after this; record a
                                    CFG change if purging its dead EH edges
                                    changed anything.  */
3165                             unlink_stmt_vdef (stmt);
3166                             if (gsi_remove (&gsi, true)
3167                                 && gimple_purge_dead_eh_edges (bb))
3168                               cfg_changed = true;
3169                             release_defs (stmt);
3170                             continue;
3171                           }
3172                           break;
3173
3174                       default:;
3175                     }
3176                 }
3177             }
3178           gsi_next (&gsi);
3179         }
3180     }
3181
3182   statistics_counter_event (fun, "widening multiplications inserted",
3183                             widen_mul_stats.widen_mults_inserted);
3184   statistics_counter_event (fun, "widening maccs inserted",
3185                             widen_mul_stats.maccs_inserted);
3186   statistics_counter_event (fun, "fused multiply-adds inserted",
3187                             widen_mul_stats.fmas_inserted);
3188
3189   return cfg_changed ? TODO_cleanup_cfg : 0;
3190 }
3191
3192 } // anon namespace
3193
/* Allocate a new pass_optimize_widening_mul for context CTXT with `new'
   and return it through its gimple_opt_pass base.  This function keeps
   no reference to the object and does not free it.  */
3194 gimple_opt_pass *
3195 make_pass_optimize_widening_mul (gcc::context *ctxt)
3196 {
3197   return new pass_optimize_widening_mul (ctxt);
3198 }