gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 3, or (at your option) any
  10 later version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  22    operations.  These are common in sequences such as this one:
  23
  24         modulus = sqrt(x*x + y*y + z*z);
  25         x = x / modulus;
  26         y = y / modulus;
  27         z = z / modulus;
  28
  29    that can be optimized to
  30
  31         modulus = sqrt(x*x + y*y + z*z);
  32         rmodulus = 1.0 / modulus;
  33         x = x * rmodulus;
  34         y = y * rmodulus;
  35         z = z * rmodulus;
  36
  37    We do this for loop invariant divisors, and with this pass whenever
  38    we notice that a division has the same divisor multiple times.
  39
  40    Of course, like in PRE, we don't insert a division if a dominator
  41    already has one.  However, this cannot be done as an extension of
  42    PRE for several reasons.
  43
  44    First of all, with some experiments it was found out that the
  45    transformation is not always useful if there are only two divisions
  46    by the same divisor.  This is probably because modern processors
  47    can pipeline the divisions; on older, in-order processors it should
  48    still be effective to optimize two divisions by the same number.
  49    We make this a param, and it shall be called N in the remainder of
  50    this comment.
  51
  52    Second, if trapping math is active, we have less freedom on where
  53    to insert divisions: we can only do so in basic blocks that already
  54    contain one.  (If divisions don't trap, instead, we can insert
  55    divisions elsewhere, which will be in blocks that are common dominators
  56    of those that have the division).
  57
  58    We really don't want to compute the reciprocal unless a division will
  59    be found.  To do this, we won't insert the division in a basic block
  60    that has less than N divisions *post-dominating* it.
  61
  62    The algorithm constructs a subset of the dominator tree, holding the
  63    blocks containing the divisions and the common dominators to them,
  64    and walks it twice.  The first walk is in post-order, and it annotates
  65    each block with the number of divisions that post-dominate it: this
  66    gives information on where divisions can be inserted profitably.
  67    The second walk is in pre-order, and it inserts divisions as explained
  68    above, and replaces divisions by multiplications.
  69
  70    In the best case, the cost of the pass is O(n_statements).  In the
  71    worst-case, the cost is due to creating the dominator tree subset,
  72    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  73    for n_statements / n_basic_blocks statements.  So, the amortized cost
  74    of creating the dominator tree subset is O(n_basic_blocks) and the
  75    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  76
  77    More practically, the cost will be small because there are few
  78    divisions, and they tend to be in the same basic block, so insert_bb
  79    is called very few times.
  80
  81    If we did this using domwalk.c, an efficient implementation would have
  82    to work on all the variables in a single pass, because we could not
  83    work on just a subset of the dominator tree, as we do now, and the
  84    cost would also be something like O(n_statements * n_basic_blocks).
  85    The data structures would be more complex in order to work on all the
  86    variables in a single pass.  */
  87
  88 #include "config.h"
  89 #include "system.h"
  90 #include "coretypes.h"
  91 #include "tm.h"
  92 #include "flags.h"
  93 #include "tree.h"
  94 #include "tree-flow.h"
  95 #include "timevar.h"
  96 #include "tree-pass.h"
  97 #include "alloc-pool.h"
  98 #include "basic-block.h"
  99 #include "target.h"
 100 #include "gimple-pretty-print.h"
 101
 102 /* FIXME: RTL headers have to be included here for optabs.  */
 103 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 104 #include "expr.h"               /* Because optabs.h wants sepops.  */
 105 #include "optabs.h"
 106
 107 /* This structure represents one basic block that either computes a
 108    division, or is a common dominator for basic blocks that compute a
 109    division.  occ_new stores a pointer to it in the block's aux field.  */
 110 struct occurrence {
 111   /* The basic block represented by this structure.  */
 112   basic_block bb;
 113
 114   /* If non-NULL, the SSA_NAME holding a reciprocal that can be used in
 115      BB: either one inserted in BB itself or one handed down from an
          enclosing occurrence by insert_reciprocals.  */
 116   tree recip_def;
 117
 118   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 119      was inserted in BB.  replace_reciprocal compares against it so the
          reciprocal computation itself is never rewritten.  */
 120   gimple recip_def_stmt;
 121
 122   /* Pointer to a list of "struct occurrence"s for blocks dominated
 123      by BB.  */
 124   struct occurrence *children;
 125
 126   /* Pointer to the next "struct occurrence" in the list of blocks
 127      sharing a common dominator.  */
 128   struct occurrence *next;
 129
 130   /* The number of divisions that are in BB before compute_merit.  The
 131      number of divisions that are in BB or post-dominate it after
 132      compute_merit.  */
 133   int num_divisions;
 134
 135   /* True if the basic block has a division, false if it is a common
 136      dominator for basic blocks that do.  If it is false and trapping
 137      math is active, BB is not a candidate for inserting a reciprocal.  */
 138   bool bb_has_division;
 139 };
 140
 141 static struct
 142 {
 143   /* Number of 1.0/X computations inserted by insert_reciprocals.  */
 144   int rdivs_inserted;
 145
 146   /* Number of a/FUNC(b) divisors rewritten to use the target's
          reciprocal built-in in execute_cse_reciprocals.  */
 147   int rfuncs_inserted;
 148 } reciprocal_stats;
 149
 150 static struct
 151 {
 152   /* Number of cexpi calls inserted by execute_cse_sincos_1.  */
 153   int inserted;
 154 } sincos_stats;
 155
 156 static struct
 157 {
 158   /* Number of hand-written 32-bit bswaps found.  */
 159   int found_32bit;
 160
 161   /* Number of hand-written 64-bit bswaps found.  */
 162   int found_64bit;
 163 } bswap_stats;
 164
 165 static struct
 166 {
 167   /* Number of widening multiplication ops inserted.  */
 168   int widen_mults_inserted;
 169
 170   /* Number of integer multiply-and-accumulate ops inserted.  */
 171   int maccs_inserted;
 172
 173   /* Number of fp fused multiply-add ops inserted.  */
 174   int fmas_inserted;
 175 } widen_mul_stats;
 176
 177 /* The instance of "struct occurrence" representing the highest
 178    interesting block in the dominator tree.  It is rebuilt for every
        SSA name examined: execute_cse_reciprocals_1 frees the tree and
        resets this pointer to NULL before returning.  */
 179 static struct occurrence *occ_head;
 180
 181 /* Allocation pool for getting instances of "struct occurrence".
        Created and released by execute_cse_reciprocals.  */
 182 static alloc_pool occ_pool;
 183
 184
 185
 186 /* Allocate and return a new struct occurrence for basic block BB, and
 187    whose children list is headed by CHILDREN.  */
 188 static struct occurrence *
 189 occ_new (basic_block bb, struct occurrence *children)
 190 {
 191   struct occurrence *occ;
 192
 193   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 194   memset (occ, 0, sizeof (struct occurrence));
 195
 196   occ->bb = bb;
 197   occ->children = children;
 198   return occ;
 199 }
 200
 201
 202 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 203    list of "struct occurrence"s, one per basic block, having IDOM as
 204    their common dominator.
 205
 206    We try to insert NEW_OCC as deep as possible in the tree, and we also
 207    insert any other block that is a common dominator for BB and one
 208    block already in the tree.  */
 209
 210 static void
 211 insert_bb (struct occurrence *new_occ, basic_block idom,
 212            struct occurrence **p_head)
 213 {
 214   struct occurrence *occ, **p_occ;
 215
 216   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 217     {
 218       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 219       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 220       if (dom == bb)
 221         {
 222           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 223              from its list.  */
 224           *p_occ = occ->next;
 225           occ->next = new_occ->children;
 226           new_occ->children = occ;
 227
 228           /* Try the next block (it may as well be dominated by BB).  */
 229         }
 230
 231       else if (dom == occ_bb)
 232         {
 233           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 234           insert_bb (new_occ, dom, &occ->children);
 235           return;
 236         }
 237
 238       else if (dom != idom)
 239         {
 240           gcc_assert (!dom->aux);
 241
 242           /* There is a dominator between IDOM and BB, add it and make
 243              two children out of NEW_OCC and OCC.  First, remove OCC from
 244              its list.  */
 245           *p_occ = occ->next;
 246           new_occ->next = occ;
 247           occ->next = NULL;
 248
 249           /* None of the previous blocks has DOM as a dominator: if we tail
 250              recursed, we would reexamine them uselessly. Just switch BB with
 251              DOM, and go on looking for blocks dominated by DOM.  */
 252           new_occ = occ_new (dom, new_occ);
 253         }
 254
 255       else
 256         {
 257           /* Nothing special, go on with the next element.  */
 258           p_occ = &occ->next;
 259         }
 260     }
 261
 262   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 263   new_occ->next = *p_head;
 264   *p_head = new_occ;
 265 }
 266
 267 /* Register that we found a division in BB.  */
 268
 269 static inline void
 270 register_division_in (basic_block bb)
 271 {
 272   struct occurrence *occ;
 273
 274   occ = (struct occurrence *) bb->aux;
 275   if (!occ)
 276     {
 277       occ = occ_new (bb, NULL);
 278       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
 279     }
 280
 281   occ->bb_has_division = true;
 282   occ->num_divisions++;
 283 }
 284
 285
 286 /* Compute the number of divisions that postdominate each block in OCC and
 287    its children.  */
 288
 289 static void
 290 compute_merit (struct occurrence *occ)
 291 {
 292   struct occurrence *occ_child;
 293   basic_block dom = occ->bb;
 294
 295   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 296     {
 297       basic_block bb;
 298       if (occ_child->children)
 299         compute_merit (occ_child);
 300
 301       if (flag_exceptions)
 302         bb = single_noncomplex_succ (dom);
 303       else
 304         bb = dom;
 305
 306       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 307         occ->num_divisions += occ_child->num_divisions;
 308     }
 309 }
 310
 311
 312 /* Return whether USE_STMT is a floating-point division by DEF.  */
 313 static inline bool
 314 is_division_by (gimple use_stmt, tree def)
 315 {
 316   return is_gimple_assign (use_stmt)
 317          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 318          && gimple_assign_rhs2 (use_stmt) == def
 319          /* Do not recognize x / x as valid division, as we are getting
 320             confused later by replacing all immediate uses x in such
 321             a stmt.  */
 322          && gimple_assign_rhs1 (use_stmt) != def;
 323 }
 324
 325 /* Walk the subset of the dominator tree rooted at OCC, setting the
 326    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 327    the given basic block.  The field may be left NULL, of course,
 328    if it is not possible or profitable to do the optimization.
 329
 330    DEF_BSI is an iterator pointing at the statement defining DEF.
 331    If RECIP_DEF is set, a dominator already has a computation that can
 332    be used.  */
 333
 334 static void
 335 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 336                     tree def, tree recip_def, int threshold)
 337 {
 338   tree type;
 339   gimple new_stmt;
 340   gimple_stmt_iterator gsi;
 341   struct occurrence *occ_child;
 342
 343   if (!recip_def
 344       && (occ->bb_has_division || !flag_trapping_math)
 345       && occ->num_divisions >= threshold)
 346     {
 347       /* Make a variable with the replacement and substitute it.  */
 348       type = TREE_TYPE (def);
 349       recip_def = make_rename_temp (type, "reciptmp");
 350       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 351                                                build_one_cst (type), def);
 352
 353       if (occ->bb_has_division)
 354         {
 355           /* Case 1: insert before an existing division.  */
 356           gsi = gsi_after_labels (occ->bb);
 357           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 358             gsi_next (&gsi);
 359
 360           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 361         }
 362       else if (def_gsi && occ->bb == def_gsi->bb)
 363         {
 364           /* Case 2: insert right after the definition.  Note that this will
 365              never happen if the definition statement can throw, because in
 366              that case the sole successor of the statement's basic block will
 367              dominate all the uses as well.  */
 368           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 369         }
 370       else
 371         {
 372           /* Case 3: insert in a basic block not containing defs/uses.  */
 373           gsi = gsi_after_labels (occ->bb);
 374           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 375         }
 376
 377       reciprocal_stats.rdivs_inserted++;
 378
 379       occ->recip_def_stmt = new_stmt;
 380     }
 381
 382   occ->recip_def = recip_def;
 383   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 384     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 385 }
 386
 387
 388 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 389    possible.  */
 390
 391 static inline void
 392 replace_reciprocal (use_operand_p use_p)
 393 {
 394   gimple use_stmt = USE_STMT (use_p);
 395   basic_block bb = gimple_bb (use_stmt);
 396   struct occurrence *occ = (struct occurrence *) bb->aux;
 397
 398   if (optimize_bb_for_speed_p (bb)
 399       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 400     {
 401       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 402       SET_USE (use_p, occ->recip_def);
 403       fold_stmt_inplace (use_stmt);
 404       update_stmt (use_stmt);
 405     }
 406 }
 407
 408
 409 /* Free OCC and return one more "struct occurrence" to be freed.  */
 410
 411 static struct occurrence *
 412 free_bb (struct occurrence *occ)
 413 {
 414   struct occurrence *child, *next;
 415
 416   /* First get the two pointers hanging off OCC.  */
 417   next = occ->next;
 418   child = occ->children;
 419   occ->bb->aux = NULL;
 420   pool_free (occ_pool, occ);
 421
 422   /* Now ensure that we don't recurse unless it is necessary.  */
 423   if (!child)
 424     return next;
 425   else
 426     {
 427       while (next)
 428         next = free_bb (next);
 429
 430       return child;
 431     }
 432 }
 433
 434
 435 /* Look for floating-point divisions among DEF's uses, and try to
 436    replace them by multiplications with the reciprocal.  Add
 437    as many statements computing the reciprocal as needed.
 438
 439    DEF must be a GIMPLE register of a floating-point type.  */
 440
 441 static void
 442 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 443 {
 444   use_operand_p use_p;
 445   imm_use_iterator use_iter;
 446   struct occurrence *occ;
 447   int count = 0, threshold;
 448
 449   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 450
 451   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 452     {
 453       gimple use_stmt = USE_STMT (use_p);
 454       if (is_division_by (use_stmt, def))
 455         {
 456           register_division_in (gimple_bb (use_stmt));
 457           count++;
 458         }
 459     }
 460
 461   /* Do the expensive part only if we can hope to optimize something.  */
 462   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 463   if (count >= threshold)
 464     {
 465       gimple use_stmt;
 466       for (occ = occ_head; occ; occ = occ->next)
 467         {
 468           compute_merit (occ);
 469           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 470         }
 471
 472       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 473         {
 474           if (is_division_by (use_stmt, def))
 475             {
 476               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 477                 replace_reciprocal (use_p);
 478             }
 479         }
 480     }
 481
 482   for (occ = occ_head; occ; )
 483     occ = free_bb (occ);
 484
 485   occ_head = NULL;
 486 }
 487
 488 static bool
 489 gate_cse_reciprocals (void)
 490 {
 491   return optimize && flag_reciprocal_math;
 492 }
 493
 494 /* Go through all the floating-point SSA_NAMEs, and call
 495    execute_cse_reciprocals_1 on each of them: first the default
        definitions of the function's arguments, then, block by block, the
        PHI results and the names defined by ordinary statements.  In
        blocks that are not optimized for size, divisions a/func(b) whose
        divisor is the result of a built-in call are afterwards rewritten
        to a*rfunc(b) when the target supplies a reciprocal built-in.

        Dominance information and the occurrence pool are set up on entry
        and released before returning.  Always returns 0.  */
 496 static unsigned int
 497 execute_cse_reciprocals (void)
 498 {
 499   basic_block bb;
 500   tree arg;
 501
 502   occ_pool = create_alloc_pool ("dominators for recip",
 503                                 sizeof (struct occurrence),
 504                                 n_basic_blocks / 3 + 1);
 505
 506   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 507   calculate_dominance_info (CDI_DOMINATORS);
 508   calculate_dominance_info (CDI_POST_DOMINATORS);
 509
       /* The aux field of every block is used to reach its occurrence, so
          it must start out clear.  */
 510 #ifdef ENABLE_CHECKING
 511   FOR_EACH_BB (bb)
 512     gcc_assert (!bb->aux);
 513 #endif
 514
       /* Arguments have no defining statement, hence the NULL iterator.  */
 515   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 516     if (gimple_default_def (cfun, arg)
 517         && FLOAT_TYPE_P (TREE_TYPE (arg))
 518         && is_gimple_reg (arg))
 519       execute_cse_reciprocals_1 (NULL, gimple_default_def (cfun, arg));
 520
 521   FOR_EACH_BB (bb)
 522     {
 523       gimple_stmt_iterator gsi;
 524       gimple phi;
 525       tree def;
 526
 527       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 528         {
 529           phi = gsi_stmt (gsi);
 530           def = PHI_RESULT (phi);
 531           if (FLOAT_TYPE_P (TREE_TYPE (def))
 532               && is_gimple_reg (def))
 533             execute_cse_reciprocals_1 (NULL, def);
 534         }
 535
 536       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 537         {
 538           gimple stmt = gsi_stmt (gsi);
 539
               /* The iterator is passed along so that insert_reciprocals
                  can place the reciprocal right after the definition.  */
 540           if (gimple_has_lhs (stmt)
 541               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 542               && FLOAT_TYPE_P (TREE_TYPE (def))
 543               && TREE_CODE (def) == SSA_NAME)
 544             execute_cse_reciprocals_1 (&gsi, def);
 545         }
 546
 547       if (optimize_bb_for_size_p (bb))
 548         continue;
 549
 550       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 551       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 552         {
 553           gimple stmt = gsi_stmt (gsi);
 554           tree fndecl;
 555
 556           if (is_gimple_assign (stmt)
 557               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 558             {
 559               tree arg1 = gimple_assign_rhs2 (stmt);
 560               gimple stmt1;
 561
 562               if (TREE_CODE (arg1) != SSA_NAME)
 563                 continue;
 564
 565               stmt1 = SSA_NAME_DEF_STMT (arg1);
 566
                   /* Only divisors produced by a normal or machine-specific
                      built-in call are candidates.  */
 567               if (is_gimple_call (stmt1)
 568                   && gimple_call_lhs (stmt1)
 569                   && (fndecl = gimple_call_fndecl (stmt1))
 570                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 571                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 572                 {
 573                   enum built_in_function code;
 574                   bool md_code, fail;
 575                   imm_use_iterator ui;
 576                   use_operand_p use_p;
 577
 578                   code = DECL_FUNCTION_CODE (fndecl);
 579                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 580
                       /* Ask the target for the reciprocal counterpart of
                          the built-in; give up on this division if there
                          is none.  */
 581                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 582                   if (!fndecl)
 583                     continue;
 584
 585                   /* Check that all uses of the SSA name are divisions,
 586                      otherwise replacing the defining statement will do
 587                      the wrong thing.  */
 588                   fail = false;
 589                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 590                     {
 591                       gimple stmt2 = USE_STMT (use_p);
 592                       if (is_gimple_debug (stmt2))
 593                         continue;
 594                       if (!is_gimple_assign (stmt2)
 595                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 596                           || gimple_assign_rhs1 (stmt2) == arg1
 597                           || gimple_assign_rhs2 (stmt2) != arg1)
 598                         {
 599                           fail = true;
 600                           break;
 601                         }
 602                     }
 603                   if (fail)
 604                     continue;
 605
                       /* NOTE(review): the LHS is replaced by itself here;
                          presumably this is done for gimple_replace_lhs's
                          handling of debug statements that still use ARG1,
                          which the check above skipped -- confirm against
                          its definition.  */
 606                   gimple_replace_lhs (stmt1, arg1);
 607                   gimple_call_set_fndecl (stmt1, fndecl);
 608                   update_stmt (stmt1);
 609                   reciprocal_stats.rfuncs_inserted++;
 610
                       /* ARG1 now holds the reciprocal, so every division by
                          it becomes a multiplication.  This reuses STMT as
                          the iteration variable; the outer loop advances
                          through GSI and reloads STMT on each iteration.  */
 611                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 612                     {
 613                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 614                       fold_stmt_inplace (stmt);
 615                       update_stmt (stmt);
 616                     }
 617                 }
 618             }
 619         }
 620     }
 621
 622   statistics_counter_event (cfun, "reciprocal divs inserted",
 623                             reciprocal_stats.rdivs_inserted);
 624   statistics_counter_event (cfun, "reciprocal functions inserted",
 625                             reciprocal_stats.rfuncs_inserted);
 626
 627   free_dominance_info (CDI_DOMINATORS);
 628   free_dominance_info (CDI_POST_DOMINATORS);
 629   free_alloc_pool (occ_pool);
 630   return 0;
 631 }
 632
 633 struct gimple_opt_pass pass_cse_reciprocals =
      /* Descriptor of the "recip" GIMPLE pass: gated by
         gate_cse_reciprocals and carried out by execute_cse_reciprocals.
         It requires SSA form and, when it finishes, requests a function
         dump, an SSA update and SSA/statement verification.  */
 634 {
 635  {
 636   GIMPLE_PASS,
 637   "recip",                              /* name */
 638   gate_cse_reciprocals,                 /* gate */
 639   execute_cse_reciprocals,              /* execute */
 640   NULL,                                 /* sub */
 641   NULL,                                 /* next */
 642   0,                                    /* static_pass_number */
 643   TV_NONE,                              /* tv_id */
 644   PROP_ssa,                             /* properties_required */
 645   0,                                    /* properties_provided */
 646   0,                                    /* properties_destroyed */
 647   0,                                    /* todo_flags_start */
 648   TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
 649     | TODO_verify_stmts                /* todo_flags_finish */
 650  }
 651 };
 652
 653 /* Records an occurrence at statement USE_STMT in the vector of trees
 654    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 655    is not yet initialized.  Returns true if the occurrence was pushed on
 656    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 657    statements in the vector.  */
 658
 659 static bool
 660 maybe_record_sincos (VEC(gimple, heap) **stmts,
 661                      basic_block *top_bb, gimple use_stmt)
 662 {
 663   basic_block use_bb = gimple_bb (use_stmt);
 664   if (*top_bb
 665       && (*top_bb == use_bb
 666           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 667     VEC_safe_push (gimple, heap, *stmts, use_stmt);
 668   else if (!*top_bb
 669            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 670     {
 671       VEC_safe_push (gimple, heap, *stmts, use_stmt);
 672       *top_bb = use_bb;
 673     }
 674   else
 675     return false;
 676
 677   return true;
 678 }
 679
 680 /* Look for sin, cos and cexpi calls with the same argument NAME and
 681    create a single call to cexpi CSEing the result in this case.
 682    We first walk over all immediate uses of the argument collecting
 683    statements that we can CSE in a vector and in a second pass replace
 684    the statement rhs with a REALPART or IMAGPART expression on the
 685    result of the cexpi call we insert before the use statement that
 686    dominates all other candidates.  */
 687
 688 static bool
 689 execute_cse_sincos_1 (tree name)
 690 {
 691   gimple_stmt_iterator gsi;
 692   imm_use_iterator use_iter;
 693   tree fndecl, res, type;
 694   gimple def_stmt, use_stmt, stmt;
 695   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 696   VEC(gimple, heap) *stmts = NULL;
 697   basic_block top_bb = NULL;
 698   int i;
 699   bool cfg_changed = false;
 700
 701   type = TREE_TYPE (name);
 702   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 703     {
 704       if (gimple_code (use_stmt) != GIMPLE_CALL
 705           || !gimple_call_lhs (use_stmt)
 706           || !(fndecl = gimple_call_fndecl (use_stmt))
 707           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 708         continue;
 709
 710       switch (DECL_FUNCTION_CODE (fndecl))
 711         {
 712         CASE_FLT_FN (BUILT_IN_COS):
 713           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 714           break;
 715
 716         CASE_FLT_FN (BUILT_IN_SIN):
 717           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 718           break;
 719
 720         CASE_FLT_FN (BUILT_IN_CEXPI):
 721           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 722           break;
 723
 724         default:;
 725         }
 726     }
 727
 728   if (seen_cos + seen_sin + seen_cexpi <= 1)
 729     {
 730       VEC_free(gimple, heap, stmts);
 731       return false;
 732     }
 733
 734   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 735      the name def statement.  */
 736   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 737   if (!fndecl)
 738     return false;
 739   res = create_tmp_reg (TREE_TYPE (TREE_TYPE (fndecl)), "sincostmp");
 740   stmt = gimple_build_call (fndecl, 1, name);
 741   res = make_ssa_name (res, stmt);
 742   gimple_call_set_lhs (stmt, res);
 743
 744   def_stmt = SSA_NAME_DEF_STMT (name);
 745   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 746       && gimple_code (def_stmt) != GIMPLE_PHI
 747       && gimple_bb (def_stmt) == top_bb)
 748     {
 749       gsi = gsi_for_stmt (def_stmt);
 750       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 751     }
 752   else
 753     {
 754       gsi = gsi_after_labels (top_bb);
 755       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 756     }
 757   update_stmt (stmt);
 758   sincos_stats.inserted++;
 759
 760   /* And adjust the recorded old call sites.  */
 761   for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
 762     {
 763       tree rhs = NULL;
 764       fndecl = gimple_call_fndecl (use_stmt);
 765
 766       switch (DECL_FUNCTION_CODE (fndecl))
 767         {
 768         CASE_FLT_FN (BUILT_IN_COS):
 769           rhs = fold_build1 (REALPART_EXPR, type, res);
 770           break;
 771
 772         CASE_FLT_FN (BUILT_IN_SIN):
 773           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 774           break;
 775
 776         CASE_FLT_FN (BUILT_IN_CEXPI):
 777           rhs = res;
 778           break;
 779
 780         default:;
 781           gcc_unreachable ();
 782         }
 783
 784         /* Replace call with a copy.  */
 785         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 786
 787         gsi = gsi_for_stmt (use_stmt);
 788         gsi_replace (&gsi, stmt, true);
 789         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 790           cfg_changed = true;
 791     }
 792
 793   VEC_free(gimple, heap, stmts);
 794
 795   return cfg_changed;
 796 }
 797
 798 /* To evaluate powi(x,n), the floating point value x raised to the
 799    constant integer exponent n, we use a hybrid algorithm that
 800    combines the "window method" with look-up tables.  For an
 801    introduction to exponentiation algorithms and "addition chains",
 802    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 803    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 804    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 805    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 806
 807 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 808    multiplications to inline before calling the system library's pow
 809    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 810    so this default never requires calling pow, powf or powl.  */
 811
 812 #ifndef POWI_MAX_MULTS
 813 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 814 #endif
 815
 816 /* The size of the "optimal power tree" lookup table.  All
 817    exponents less than this value are simply looked up in the
 818    powi_table below.  This threshold is also used to size the
 819    cache of pseudo registers that hold intermediate results.
        powi_table stores its entries as unsigned char, so raising this
        value above 256 also requires a wider element type there.  */
 820 #define POWI_TABLE_SIZE 256
 821
 822 /* The size, in bits, of the window used in the "window method"
 823    exponentiation algorithm.  This is equivalent to a radix of
 824    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 825 #define POWI_WINDOW_SIZE 3
 826
 827 /* The following table is an efficient representation of an
 828    "optimal power tree".  For each value, i, the corresponding
 829    value, j, in the table states that an optimal evaluation
 830    sequence for calculating pow(x,i) can be found by evaluating
 831    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 832    100 integers is given in Knuth's "Seminumerical algorithms".  */
 833
 834 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 835   {
 836       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 837       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 838       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 839      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 840      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 841      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 842      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 843      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 844      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 845      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 846      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 847      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 848      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 849      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 850      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 851      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 852      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 853      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 854      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 855      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 856      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 857      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 858      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 859      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 860      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 861     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 862     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 863     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 864     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 865     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 866     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 867     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 868   };
 869
 870
 871 /* Return the number of multiplications required to calculate
 872    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 873    subroutine of powi_cost.  CACHE is an array indicating
 874    which exponents have already been calculated.  */
 875
 876 static int
 877 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 878 {
 879   /* If we've already calculated this exponent, then this evaluation
 880      doesn't require any additional multiplications.  */
 881   if (cache[n])
 882     return 0;
 883
 884   cache[n] = true;
 885   return powi_lookup_cost (n - powi_table[n], cache)
 886          + powi_lookup_cost (powi_table[n], cache) + 1;
 887 }
 888
 889 /* Return the number of multiplications required to calculate
 890    powi(x,n) for an arbitrary x, given the exponent N.  This
 891    function needs to be kept in sync with powi_as_mults below.  */
 892
 893 static int
 894 powi_cost (HOST_WIDE_INT n)
 895 {
 896   bool cache[POWI_TABLE_SIZE];
 897   unsigned HOST_WIDE_INT digit;
 898   unsigned HOST_WIDE_INT val;
 899   int result;
 900
 901   if (n == 0)
 902     return 0;
 903
 904   /* Ignore the reciprocal when calculating the cost.  */
 905   val = (n < 0) ? -n : n;
 906
 907   /* Initialize the exponent cache.  */
 908   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 909   cache[1] = true;
 910
 911   result = 0;
 912
 913   while (val >= POWI_TABLE_SIZE)
 914     {
 915       if (val & 1)
 916         {
 917           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 918           result += powi_lookup_cost (digit, cache)
 919                     + POWI_WINDOW_SIZE + 1;
 920           val >>= POWI_WINDOW_SIZE;
 921         }
 922       else
 923         {
 924           val >>= 1;
 925           result++;
 926         }
 927     }
 928
 929   return result + powi_lookup_cost (val, cache);
 930 }
 931
 932 /* Recursive subroutine of powi_as_mults.  This function takes the
 933    array, CACHE, of already calculated exponents and an exponent N and
 934    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 935
 936 static tree
 937 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 938                  HOST_WIDE_INT n, tree *cache, tree target)
 939 {
 940   tree op0, op1, ssa_target;
 941   unsigned HOST_WIDE_INT digit;
 942   gimple mult_stmt;
 943
 944   if (n < POWI_TABLE_SIZE && cache[n])
 945     return cache[n];
 946
 947   ssa_target = make_ssa_name (target, NULL);
 948
 949   if (n < POWI_TABLE_SIZE)
 950     {
 951       cache[n] = ssa_target;
 952       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
 953       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
 954     }
 955   else if (n & 1)
 956     {
 957       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 958       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
 959       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
 960     }
 961   else
 962     {
 963       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
 964       op1 = op0;
 965     }
 966
 967   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
 968   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 969
 970   return ssa_target;
 971 }
 972
 973 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 974    This function needs to be kept in sync with powi_cost above.  */
 975
 976 static tree
 977 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 978                tree arg0, HOST_WIDE_INT n)
 979 {
 980   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
 981   gimple div_stmt;
 982
 983   if (n == 0)
 984     return build_real (type, dconst1);
 985
 986   memset (cache, 0,  sizeof (cache));
 987   cache[1] = arg0;
 988
 989   target = create_tmp_var (type, "powmult");
 990   add_referenced_var (target);
 991
 992   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);
 993
 994   if (n >= 0)
 995     return result;
 996
 997   /* If the original exponent was negative, reciprocate the result.  */
 998   target = make_ssa_name (target, NULL);
 999   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1000                                            build_real (type, dconst1),
1001                                            result);
1002   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1003
1004   return target;
1005 }
1006
1007 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1008    location info LOC.  If the arguments are appropriate, create an
1009    equivalent sequence of statements prior to GSI using an optimal
1010    number of multiplications, and return an expession holding the
1011    result.  */
1012
1013 static tree
1014 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1015                             tree arg0, HOST_WIDE_INT n)
1016 {
1017   /* Avoid largest negative number.  */
1018   if (n != -n
1019       && ((n >= -1 && n <= 2)
1020           || (optimize_function_for_speed_p (cfun)
1021               && powi_cost (n) <= POWI_MAX_MULTS)))
1022     return powi_as_mults (gsi, loc, arg0, n);
1023
1024   return NULL_TREE;
1025 }
1026
1027 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1028    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1029    an optimal number of multiplies, when n is a constant.  */
1030
1031 static unsigned int
1032 execute_cse_sincos (void)
1033 {
1034   basic_block bb;
1035   bool cfg_changed = false;
1036
1037   calculate_dominance_info (CDI_DOMINATORS);
1038   memset (&sincos_stats, 0, sizeof (sincos_stats));
1039
1040   FOR_EACH_BB (bb)
1041     {
1042       gimple_stmt_iterator gsi;
1043
1044       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1045         {
1046           gimple stmt = gsi_stmt (gsi);
1047           tree fndecl;
1048
1049           if (is_gimple_call (stmt)
1050               && gimple_call_lhs (stmt)
1051               && (fndecl = gimple_call_fndecl (stmt))
1052               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1053             {
1054               tree arg, arg0, arg1, result;
1055               HOST_WIDE_INT n;
1056               location_t loc;
1057
1058               switch (DECL_FUNCTION_CODE (fndecl))
1059                 {
1060                 CASE_FLT_FN (BUILT_IN_COS):
1061                 CASE_FLT_FN (BUILT_IN_SIN):
1062                 CASE_FLT_FN (BUILT_IN_CEXPI):
1063                   arg = gimple_call_arg (stmt, 0);
1064                   if (TREE_CODE (arg) == SSA_NAME)
1065                     cfg_changed |= execute_cse_sincos_1 (arg);
1066                   break;
1067
1068                 CASE_FLT_FN (BUILT_IN_POWI):
1069                   arg0 = gimple_call_arg (stmt, 0);
1070                   arg1 = gimple_call_arg (stmt, 1);
1071                   if (!host_integerp (arg1, 0))
1072                     break;
1073
1074                   n = TREE_INT_CST_LOW (arg1);
1075                   loc = gimple_location (stmt);
1076                   result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1077
1078                   if (result)
1079                     {
1080                       tree lhs = gimple_get_lhs (stmt);
1081                       gimple new_stmt = gimple_build_assign (lhs, result);
1082                       gimple_set_location (new_stmt, loc);
1083                       unlink_stmt_vdef (stmt);
1084                       gsi_replace (&gsi, new_stmt, true);
1085                     }
1086                   break;
1087
1088                 default:;
1089                 }
1090             }
1091         }
1092     }
1093
1094   statistics_counter_event (cfun, "sincos statements inserted",
1095                             sincos_stats.inserted);
1096
1097   free_dominance_info (CDI_DOMINATORS);
1098   return cfg_changed ? TODO_cleanup_cfg : 0;
1099 }
1100
1101 static bool
1102 gate_cse_sincos (void)
1103 {
1104   /* We no longer require either sincos or cexp, since powi expansion
1105      piggybacks on this pass.  */
1106   return optimize;
1107 }
1108
1109 struct gimple_opt_pass pass_cse_sincos =
1110 {
1111  {
1112   GIMPLE_PASS,
1113   "sincos",                             /* name */
1114   gate_cse_sincos,                      /* gate */
1115   execute_cse_sincos,                   /* execute */
1116   NULL,                                 /* sub */
1117   NULL,                                 /* next */
1118   0,                                    /* static_pass_number */
1119   TV_NONE,                              /* tv_id */
1120   PROP_ssa,                             /* properties_required */
1121   0,                                    /* properties_provided */
1122   0,                                    /* properties_destroyed */
1123   0,                                    /* todo_flags_start */
1124   TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
1125     | TODO_verify_stmts                 /* todo_flags_finish */
1126  }
1127 };
1128
1129 /* A symbolic number is used to detect byte permutation and selection
1130    patterns.  Therefore the field N contains an artificial number
1131    consisting of byte size markers:
1132
1133    0    - byte has the value 0
1134    1..size - byte contains the content of the byte
1135    number indexed with that value minus one  */
1136
1137 struct symbolic_number {
1138   unsigned HOST_WIDEST_INT n;
1139   int size;
1140 };
1141
1142 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1143    number N.  Return false if the requested operation is not permitted
1144    on a symbolic number.  */
1145
1146 static inline bool
1147 do_shift_rotate (enum tree_code code,
1148                  struct symbolic_number *n,
1149                  int count)
1150 {
1151   if (count % 8 != 0)
1152     return false;
1153
1154   /* Zero out the extra bits of N in order to avoid them being shifted
1155      into the significant bits.  */
1156   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1157     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1158
1159   switch (code)
1160     {
1161     case LSHIFT_EXPR:
1162       n->n <<= count;
1163       break;
1164     case RSHIFT_EXPR:
1165       n->n >>= count;
1166       break;
1167     case LROTATE_EXPR:
1168       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1169       break;
1170     case RROTATE_EXPR:
1171       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1172       break;
1173     default:
1174       return false;
1175     }
1176   return true;
1177 }
1178
1179 /* Perform sanity checking for the symbolic number N and the gimple
1180    statement STMT.  */
1181
1182 static inline bool
1183 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1184 {
1185   tree lhs_type;
1186
1187   lhs_type = gimple_expr_type (stmt);
1188
1189   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1190     return false;
1191
1192   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1193     return false;
1194
1195   return true;
1196 }
1197
1198 /* find_bswap_1 invokes itself recursively with N and tries to perform
1199    the operation given by the rhs of STMT on the result.  If the
1200    operation could successfully be executed the function returns the
1201    tree expression of the source operand and NULL otherwise.  */
1202
1203 static tree
1204 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1205 {
1206   enum tree_code code;
1207   tree rhs1, rhs2 = NULL;
1208   gimple rhs1_stmt, rhs2_stmt;
1209   tree source_expr1;
1210   enum gimple_rhs_class rhs_class;
1211
1212   if (!limit || !is_gimple_assign (stmt))
1213     return NULL_TREE;
1214
1215   rhs1 = gimple_assign_rhs1 (stmt);
1216
1217   if (TREE_CODE (rhs1) != SSA_NAME)
1218     return NULL_TREE;
1219
1220   code = gimple_assign_rhs_code (stmt);
1221   rhs_class = gimple_assign_rhs_class (stmt);
1222   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1223
1224   if (rhs_class == GIMPLE_BINARY_RHS)
1225     rhs2 = gimple_assign_rhs2 (stmt);
1226
1227   /* Handle unary rhs and binary rhs with integer constants as second
1228      operand.  */
1229
1230   if (rhs_class == GIMPLE_UNARY_RHS
1231       || (rhs_class == GIMPLE_BINARY_RHS
1232           && TREE_CODE (rhs2) == INTEGER_CST))
1233     {
1234       if (code != BIT_AND_EXPR
1235           && code != LSHIFT_EXPR
1236           && code != RSHIFT_EXPR
1237           && code != LROTATE_EXPR
1238           && code != RROTATE_EXPR
1239           && code != NOP_EXPR
1240           && code != CONVERT_EXPR)
1241         return NULL_TREE;
1242
1243       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1244
1245       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1246          to initialize the symbolic number.  */
1247       if (!source_expr1)
1248         {
1249           /* Set up the symbolic number N by setting each byte to a
1250              value between 1 and the byte size of rhs1.  The highest
1251              order byte is set to n->size and the lowest order
1252              byte to 1.  */
1253           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1254           if (n->size % BITS_PER_UNIT != 0)
1255             return NULL_TREE;
1256           n->size /= BITS_PER_UNIT;
1257           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1258                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1259
1260           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1261             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1262                      (n->size * BITS_PER_UNIT)) - 1;
1263
1264           source_expr1 = rhs1;
1265         }
1266
1267       switch (code)
1268         {
1269         case BIT_AND_EXPR:
1270           {
1271             int i;
1272             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1273             unsigned HOST_WIDEST_INT tmp = val;
1274
1275             /* Only constants masking full bytes are allowed.  */
1276             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1277               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1278                 return NULL_TREE;
1279
1280             n->n &= val;
1281           }
1282           break;
1283         case LSHIFT_EXPR:
1284         case RSHIFT_EXPR:
1285         case LROTATE_EXPR:
1286         case RROTATE_EXPR:
1287           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1288             return NULL_TREE;
1289           break;
1290         CASE_CONVERT:
1291           {
1292             int type_size;
1293
1294             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1295             if (type_size % BITS_PER_UNIT != 0)
1296               return NULL_TREE;
1297
1298             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1299               {
1300                 /* If STMT casts to a smaller type mask out the bits not
1301                    belonging to the target type.  */
1302                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1303               }
1304             n->size = type_size / BITS_PER_UNIT;
1305           }
1306           break;
1307         default:
1308           return NULL_TREE;
1309         };
1310       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1311     }
1312
1313   /* Handle binary rhs.  */
1314
1315   if (rhs_class == GIMPLE_BINARY_RHS)
1316     {
1317       struct symbolic_number n1, n2;
1318       tree source_expr2;
1319
1320       if (code != BIT_IOR_EXPR)
1321         return NULL_TREE;
1322
1323       if (TREE_CODE (rhs2) != SSA_NAME)
1324         return NULL_TREE;
1325
1326       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1327
1328       switch (code)
1329         {
1330         case BIT_IOR_EXPR:
1331           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1332
1333           if (!source_expr1)
1334             return NULL_TREE;
1335
1336           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1337
1338           if (source_expr1 != source_expr2
1339               || n1.size != n2.size)
1340             return NULL_TREE;
1341
1342           n->size = n1.size;
1343           n->n = n1.n | n2.n;
1344
1345           if (!verify_symbolic_number_p (n, stmt))
1346             return NULL_TREE;
1347
1348           break;
1349         default:
1350           return NULL_TREE;
1351         }
1352       return source_expr1;
1353     }
1354   return NULL_TREE;
1355 }
1356
1357 /* Check if STMT completes a bswap implementation consisting of ORs,
1358    SHIFTs and ANDs.  Return the source tree expression on which the
1359    byte swap is performed and NULL if no bswap was found.  */
1360
1361 static tree
1362 find_bswap (gimple stmt)
1363 {
1364 /* The number which the find_bswap result should match in order to
1365    have a full byte swap.  The number is shifted to the left according
1366    to the size of the symbolic number before using it.  */
1367   unsigned HOST_WIDEST_INT cmp =
1368     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1369     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1370
1371   struct symbolic_number n;
1372   tree source_expr;
1373
1374   /* The last parameter determines the depth search limit.  It usually
1375      correlates directly to the number of bytes to be touched.  We
1376      increase that number by one here in order to also cover signed ->
1377      unsigned conversions of the src operand as can be seen in
1378      libgcc.  */
1379   source_expr =  find_bswap_1 (stmt, &n,
1380                                TREE_INT_CST_LOW (
1381                                  TYPE_SIZE_UNIT (gimple_expr_type (stmt))) + 1);
1382
1383   if (!source_expr)
1384     return NULL_TREE;
1385
1386   /* Zero out the extra bits of N and CMP.  */
1387   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1388     {
1389       unsigned HOST_WIDEST_INT mask =
1390         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1391
1392       n.n &= mask;
1393       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1394     }
1395
1396   /* A complete byte swap should make the symbolic number to start
1397      with the largest digit in the highest order byte.  */
1398   if (cmp != n.n)
1399     return NULL_TREE;
1400
1401   return source_expr;
1402 }
1403
1404 /* Find manual byte swap implementations and turn them into a bswap
1405    builtin invokation.  */
1406
1407 static unsigned int
1408 execute_optimize_bswap (void)
1409 {
1410   basic_block bb;
1411   bool bswap32_p, bswap64_p;
1412   bool changed = false;
1413   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1414
1415   if (BITS_PER_UNIT != 8)
1416     return 0;
1417
1418   if (sizeof (HOST_WIDEST_INT) < 8)
1419     return 0;
1420
1421   bswap32_p = (built_in_decls[BUILT_IN_BSWAP32]
1422                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1423   bswap64_p = (built_in_decls[BUILT_IN_BSWAP64]
1424                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1425                    || (bswap32_p && word_mode == SImode)));
1426
1427   if (!bswap32_p && !bswap64_p)
1428     return 0;
1429
1430   /* Determine the argument type of the builtins.  The code later on
1431      assumes that the return and argument type are the same.  */
1432   if (bswap32_p)
1433     {
1434       tree fndecl = built_in_decls[BUILT_IN_BSWAP32];
1435       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1436     }
1437
1438   if (bswap64_p)
1439     {
1440       tree fndecl = built_in_decls[BUILT_IN_BSWAP64];
1441       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1442     }
1443
1444   memset (&bswap_stats, 0, sizeof (bswap_stats));
1445
1446   FOR_EACH_BB (bb)
1447     {
1448       gimple_stmt_iterator gsi;
1449
1450       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1451         {
1452           gimple stmt = gsi_stmt (gsi);
1453           tree bswap_src, bswap_type;
1454           tree bswap_tmp;
1455           tree fndecl = NULL_TREE;
1456           int type_size;
1457           gimple call;
1458
1459           if (!is_gimple_assign (stmt)
1460               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1461             continue;
1462
1463           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1464
1465           switch (type_size)
1466             {
1467             case 32:
1468               if (bswap32_p)
1469                 {
1470                   fndecl = built_in_decls[BUILT_IN_BSWAP32];
1471                   bswap_type = bswap32_type;
1472                 }
1473               break;
1474             case 64:
1475               if (bswap64_p)
1476                 {
1477                   fndecl = built_in_decls[BUILT_IN_BSWAP64];
1478                   bswap_type = bswap64_type;
1479                 }
1480               break;
1481             default:
1482               continue;
1483             }
1484
1485           if (!fndecl)
1486             continue;
1487
1488           bswap_src = find_bswap (stmt);
1489
1490           if (!bswap_src)
1491             continue;
1492
1493           changed = true;
1494           if (type_size == 32)
1495             bswap_stats.found_32bit++;
1496           else
1497             bswap_stats.found_64bit++;
1498
1499           bswap_tmp = bswap_src;
1500
1501           /* Convert the src expression if necessary.  */
1502           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1503             {
1504               gimple convert_stmt;
1505
1506               bswap_tmp = create_tmp_var (bswap_type, "bswapsrc");
1507               add_referenced_var (bswap_tmp);
1508               bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1509
1510               convert_stmt = gimple_build_assign_with_ops (
1511                                CONVERT_EXPR, bswap_tmp, bswap_src, NULL);
1512               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1513             }
1514
1515           call = gimple_build_call (fndecl, 1, bswap_tmp);
1516
1517           bswap_tmp = gimple_assign_lhs (stmt);
1518
1519           /* Convert the result if necessary.  */
1520           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1521             {
1522               gimple convert_stmt;
1523
1524               bswap_tmp = create_tmp_var (bswap_type, "bswapdst");
1525               add_referenced_var (bswap_tmp);
1526               bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1527               convert_stmt = gimple_build_assign_with_ops (
1528                                CONVERT_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1529               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1530             }
1531
1532           gimple_call_set_lhs (call, bswap_tmp);
1533
1534           if (dump_file)
1535             {
1536               fprintf (dump_file, "%d bit bswap implementation found at: ",
1537                        (int)type_size);
1538               print_gimple_stmt (dump_file, stmt, 0, 0);
1539             }
1540
1541           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1542           gsi_remove (&gsi, true);
1543         }
1544     }
1545
1546   statistics_counter_event (cfun, "32-bit bswap implementations found",
1547                             bswap_stats.found_32bit);
1548   statistics_counter_event (cfun, "64-bit bswap implementations found",
1549                             bswap_stats.found_64bit);
1550
1551   return (changed ? TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
1552           | TODO_verify_stmts : 0);
1553 }
1554
1555 static bool
1556 gate_optimize_bswap (void)
1557 {
1558   return flag_expensive_optimizations && optimize;
1559 }
1560
1561 struct gimple_opt_pass pass_optimize_bswap =
1562 {
1563  {
1564   GIMPLE_PASS,
1565   "bswap",                              /* name */
1566   gate_optimize_bswap,                  /* gate */
1567   execute_optimize_bswap,               /* execute */
1568   NULL,                                 /* sub */
1569   NULL,                                 /* next */
1570   0,                                    /* static_pass_number */
1571   TV_NONE,                              /* tv_id */
1572   PROP_ssa,                             /* properties_required */
1573   0,                                    /* properties_provided */
1574   0,                                    /* properties_destroyed */
1575   0,                                    /* todo_flags_start */
1576   0                                     /* todo_flags_finish */
1577  }
1578 };
1579
1580 /* Return true if RHS is a suitable operand for a widening multiplication.
1581    There are two cases:
1582
1583      - RHS makes some value twice as wide.  Store that value in *NEW_RHS_OUT
1584        if so, and store its type in *TYPE_OUT.
1585
1586      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
1587        but leave *TYPE_OUT untouched.  */
1588
1589 static bool
1590 is_widening_mult_rhs_p (tree rhs, tree *type_out, tree *new_rhs_out)
1591 {
1592   gimple stmt;
1593   tree type, type1, rhs1;
1594   enum tree_code rhs_code;
1595
1596   if (TREE_CODE (rhs) == SSA_NAME)
1597     {
1598       type = TREE_TYPE (rhs);
1599       stmt = SSA_NAME_DEF_STMT (rhs);
1600       if (!is_gimple_assign (stmt))
1601         return false;
1602
1603       rhs_code = gimple_assign_rhs_code (stmt);
1604       if (TREE_CODE (type) == INTEGER_TYPE
1605           ? !CONVERT_EXPR_CODE_P (rhs_code)
1606           : rhs_code != FIXED_CONVERT_EXPR)
1607         return false;
1608
1609       rhs1 = gimple_assign_rhs1 (stmt);
1610       type1 = TREE_TYPE (rhs1);
1611       if (TREE_CODE (type1) != TREE_CODE (type)
1612           || TYPE_PRECISION (type1) * 2 != TYPE_PRECISION (type))
1613         return false;
1614
1615       *new_rhs_out = rhs1;
1616       *type_out = type1;
1617       return true;
1618     }
1619
1620   if (TREE_CODE (rhs) == INTEGER_CST)
1621     {
1622       *new_rhs_out = rhs;
1623       *type_out = NULL;
1624       return true;
1625     }
1626
1627   return false;
1628 }
1629
1630 /* Return true if STMT performs a widening multiplication.  If so,
1631    store the unwidened types of the operands in *TYPE1_OUT and *TYPE2_OUT
1632    respectively.  Also fill *RHS1_OUT and *RHS2_OUT such that converting
1633    those operands to types *TYPE1_OUT and *TYPE2_OUT would give the
1634    operands of the multiplication.  */
1635
1636 static bool
1637 is_widening_mult_p (gimple stmt,
1638                     tree *type1_out, tree *rhs1_out,
1639                     tree *type2_out, tree *rhs2_out)
1640 {
1641   tree type;
1642
1643   type = TREE_TYPE (gimple_assign_lhs (stmt));
1644   if (TREE_CODE (type) != INTEGER_TYPE
1645       && TREE_CODE (type) != FIXED_POINT_TYPE)
1646     return false;
1647
1648   if (!is_widening_mult_rhs_p (gimple_assign_rhs1 (stmt), type1_out, rhs1_out))
1649     return false;
1650
1651   if (!is_widening_mult_rhs_p (gimple_assign_rhs2 (stmt), type2_out, rhs2_out))
1652     return false;
1653
1654   if (*type1_out == NULL)
1655     {
1656       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
1657         return false;
1658       *type1_out = *type2_out;
1659     }
1660
1661   if (*type2_out == NULL)
1662     {
1663       if (!int_fits_type_p (*rhs2_out, *type1_out))
1664         return false;
1665       *type2_out = *type1_out;
1666     }
1667
1668   return true;
1669 }
1670
1671 /* Process a single gimple statement STMT, which has a MULT_EXPR as
1672    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
1673    value is true iff we converted the statement.  */
1674
1675 static bool
1676 convert_mult_to_widen (gimple stmt)
1677 {
1678   tree lhs, rhs1, rhs2, type, type1, type2;
1679   enum insn_code handler;
1680
1681   lhs = gimple_assign_lhs (stmt);
1682   type = TREE_TYPE (lhs);
1683   if (TREE_CODE (type) != INTEGER_TYPE)
1684     return false;
1685
1686   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
1687     return false;
1688
1689   if (TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2))
1690     handler = optab_handler (umul_widen_optab, TYPE_MODE (type));
1691   else if (!TYPE_UNSIGNED (type1) && !TYPE_UNSIGNED (type2))
1692     handler = optab_handler (smul_widen_optab, TYPE_MODE (type));
1693   else
1694     handler = optab_handler (usmul_widen_optab, TYPE_MODE (type));
1695
1696   if (handler == CODE_FOR_nothing)
1697     return false;
1698
1699   gimple_assign_set_rhs1 (stmt, fold_convert (type1, rhs1));
1700   gimple_assign_set_rhs2 (stmt, fold_convert (type2, rhs2));
1701   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
1702   update_stmt (stmt);
1703   widen_mul_stats.widen_mults_inserted++;
1704   return true;
1705 }
1706
1707 /* Process a single gimple statement STMT, which is found at the
1708    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
1709    rhs (given by CODE), and try to convert it into a
1710    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
1711    is true iff we converted the statement.  */
1712
1713 static bool
1714 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
1715                             enum tree_code code)
1716 {
1717   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
1718   tree type, type1, type2;
1719   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
1720   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
1721   optab this_optab;
1722   enum tree_code wmult_code;
1723
1724   lhs = gimple_assign_lhs (stmt);
1725   type = TREE_TYPE (lhs);
1726   if (TREE_CODE (type) != INTEGER_TYPE
1727       && TREE_CODE (type) != FIXED_POINT_TYPE)
1728     return false;
1729
1730   if (code == MINUS_EXPR)
1731     wmult_code = WIDEN_MULT_MINUS_EXPR;
1732   else
1733     wmult_code = WIDEN_MULT_PLUS_EXPR;
1734
1735   rhs1 = gimple_assign_rhs1 (stmt);
1736   rhs2 = gimple_assign_rhs2 (stmt);
1737
1738   if (TREE_CODE (rhs1) == SSA_NAME)
1739     {
1740       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1741       if (is_gimple_assign (rhs1_stmt))
1742         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
1743     }
1744   else
1745     return false;
1746
1747   if (TREE_CODE (rhs2) == SSA_NAME)
1748     {
1749       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1750       if (is_gimple_assign (rhs2_stmt))
1751         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
1752     }
1753   else
1754     return false;
1755
1756   if (code == PLUS_EXPR && rhs1_code == MULT_EXPR)
1757     {
1758       if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
1759                                &type2, &mult_rhs2))
1760         return false;
1761       add_rhs = rhs2;
1762     }
1763   else if (rhs2_code == MULT_EXPR)
1764     {
1765       if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
1766                                &type2, &mult_rhs2))
1767         return false;
1768       add_rhs = rhs1;
1769     }
1770   else if (code == PLUS_EXPR && rhs1_code == WIDEN_MULT_EXPR)
1771     {
1772       mult_rhs1 = gimple_assign_rhs1 (rhs1_stmt);
1773       mult_rhs2 = gimple_assign_rhs2 (rhs1_stmt);
1774       type1 = TREE_TYPE (mult_rhs1);
1775       type2 = TREE_TYPE (mult_rhs2);
1776       add_rhs = rhs2;
1777     }
1778   else if (rhs2_code == WIDEN_MULT_EXPR)
1779     {
1780       mult_rhs1 = gimple_assign_rhs1 (rhs2_stmt);
1781       mult_rhs2 = gimple_assign_rhs2 (rhs2_stmt);
1782       type1 = TREE_TYPE (mult_rhs1);
1783       type2 = TREE_TYPE (mult_rhs2);
1784       add_rhs = rhs1;
1785     }
1786   else
1787     return false;
1788
1789   if (TYPE_UNSIGNED (type1) != TYPE_UNSIGNED (type2))
1790     return false;
1791
1792   /* Verify that the machine can perform a widening multiply
1793      accumulate in this mode/signedness combination, otherwise
1794      this transformation is likely to pessimize code.  */
1795   this_optab = optab_for_tree_code (wmult_code, type1, optab_default);
1796   if (optab_handler (this_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
1797     return false;
1798
1799   /* ??? May need some type verification here?  */
1800
1801   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code,
1802                                     fold_convert (type1, mult_rhs1),
1803                                     fold_convert (type2, mult_rhs2),
1804                                     add_rhs);
1805   update_stmt (gsi_stmt (*gsi));
1806   widen_mul_stats.maccs_inserted++;
1807   return true;
1808 }
1809
1810 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
1811    with uses in additions and subtractions to form fused multiply-add
1812    operations.  Returns true if successful and MUL_STMT should be removed.  */
1813
1814 static bool
1815 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
1816 {
1817   tree mul_result = gimple_get_lhs (mul_stmt);
1818   tree type = TREE_TYPE (mul_result);
1819   gimple use_stmt, neguse_stmt, fma_stmt;
1820   use_operand_p use_p;
1821   imm_use_iterator imm_iter;
1822
1823   if (FLOAT_TYPE_P (type)
1824       && flag_fp_contract_mode == FP_CONTRACT_OFF)
1825     return false;
1826
1827   /* We don't want to do bitfield reduction ops.  */
1828   if (INTEGRAL_TYPE_P (type)
1829       && (TYPE_PRECISION (type)
1830           != GET_MODE_PRECISION (TYPE_MODE (type))))
1831     return false;
1832
1833   /* If the target doesn't support it, don't generate it.  We assume that
1834      if fma isn't available then fms, fnma or fnms are not either.  */
1835   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
1836     return false;
1837
1838   /* Make sure that the multiplication statement becomes dead after
1839      the transformation, thus that all uses are transformed to FMAs.
1840      This means we assume that an FMA operation has the same cost
1841      as an addition.  */
1842   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
1843     {
1844       enum tree_code use_code;
1845       tree result = mul_result;
1846       bool negate_p = false;
1847
1848       use_stmt = USE_STMT (use_p);
1849
1850       if (is_gimple_debug (use_stmt))
1851         continue;
1852
1853       /* For now restrict this operations to single basic blocks.  In theory
1854          we would want to support sinking the multiplication in
1855          m = a*b;
1856          if ()
1857            ma = m + c;
1858          else
1859            d = m;
1860          to form a fma in the then block and sink the multiplication to the
1861          else block.  */
1862       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
1863         return false;
1864
1865       if (!is_gimple_assign (use_stmt))
1866         return false;
1867
1868       use_code = gimple_assign_rhs_code (use_stmt);
1869
1870       /* A negate on the multiplication leads to FNMA.  */
1871       if (use_code == NEGATE_EXPR)
1872         {
1873           ssa_op_iter iter;
1874           tree use;
1875
1876           result = gimple_assign_lhs (use_stmt);
1877
1878           /* Make sure the negate statement becomes dead with this
1879              single transformation.  */
1880           if (!single_imm_use (gimple_assign_lhs (use_stmt),
1881                                &use_p, &neguse_stmt))
1882             return false;
1883
1884           /* Make sure the multiplication isn't also used on that stmt.  */
1885           FOR_EACH_SSA_TREE_OPERAND (use, neguse_stmt, iter, SSA_OP_USE)
1886             if (use == mul_result)
1887               return false;
1888
1889           /* Re-validate.  */
1890           use_stmt = neguse_stmt;
1891           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
1892             return false;
1893           if (!is_gimple_assign (use_stmt))
1894             return false;
1895
1896           use_code = gimple_assign_rhs_code (use_stmt);
1897           negate_p = true;
1898         }
1899
1900       switch (use_code)
1901         {
1902         case MINUS_EXPR:
1903           if (gimple_assign_rhs2 (use_stmt) == result)
1904             negate_p = !negate_p;
1905           break;
1906         case PLUS_EXPR:
1907           break;
1908         default:
1909           /* FMA can only be formed from PLUS and MINUS.  */
1910           return false;
1911         }
1912
1913       /* We can't handle a * b + a * b.  */
1914       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
1915         return false;
1916
1917       /* While it is possible to validate whether or not the exact form
1918          that we've recognized is available in the backend, the assumption
1919          is that the transformation is never a loss.  For instance, suppose
1920          the target only has the plain FMA pattern available.  Consider
1921          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
1922          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
1923          still have 3 operations, but in the FMA form the two NEGs are
1924          independant and could be run in parallel.  */
1925     }
1926
1927   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
1928     {
1929       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
1930       enum tree_code use_code;
1931       tree addop, mulop1 = op1, result = mul_result;
1932       bool negate_p = false;
1933
1934       if (is_gimple_debug (use_stmt))
1935         continue;
1936
1937       use_code = gimple_assign_rhs_code (use_stmt);
1938       if (use_code == NEGATE_EXPR)
1939         {
1940           result = gimple_assign_lhs (use_stmt);
1941           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
1942           gsi_remove (&gsi, true);
1943           release_defs (use_stmt);
1944
1945           use_stmt = neguse_stmt;
1946           gsi = gsi_for_stmt (use_stmt);
1947           use_code = gimple_assign_rhs_code (use_stmt);
1948           negate_p = true;
1949         }
1950
1951       if (gimple_assign_rhs1 (use_stmt) == result)
1952         {
1953           addop = gimple_assign_rhs2 (use_stmt);
1954           /* a * b - c -> a * b + (-c)  */
1955           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
1956             addop = force_gimple_operand_gsi (&gsi,
1957                                               build1 (NEGATE_EXPR,
1958                                                       type, addop),
1959                                               true, NULL_TREE, true,
1960                                               GSI_SAME_STMT);
1961         }
1962       else
1963         {
1964           addop = gimple_assign_rhs1 (use_stmt);
1965           /* a - b * c -> (-b) * c + a */
1966           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
1967             negate_p = !negate_p;
1968         }
1969
1970       if (negate_p)
1971         mulop1 = force_gimple_operand_gsi (&gsi,
1972                                            build1 (NEGATE_EXPR,
1973                                                    type, mulop1),
1974                                            true, NULL_TREE, true,
1975                                            GSI_SAME_STMT);
1976
1977       fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
1978                                                 gimple_assign_lhs (use_stmt),
1979                                                 mulop1, op2,
1980                                                 addop);
1981       gsi_replace (&gsi, fma_stmt, true);
1982       widen_mul_stats.fmas_inserted++;
1983     }
1984
1985   return true;
1986 }
1987
1988 /* Find integer multiplications where the operands are extended from
1989    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
1990    where appropriate.  */
1991
1992 static unsigned int
1993 execute_optimize_widening_mul (void)
1994 {
1995   basic_block bb;
1996   bool cfg_changed = false;
1997
1998   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
1999
2000   FOR_EACH_BB (bb)
2001     {
2002       gimple_stmt_iterator gsi;
2003
2004       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2005         {
2006           gimple stmt = gsi_stmt (gsi);
2007           enum tree_code code;
2008
2009           if (is_gimple_assign (stmt))
2010             {
2011               code = gimple_assign_rhs_code (stmt);
2012               switch (code)
2013                 {
2014                 case MULT_EXPR:
2015                   if (!convert_mult_to_widen (stmt)
2016                       && convert_mult_to_fma (stmt,
2017                                               gimple_assign_rhs1 (stmt),
2018                                               gimple_assign_rhs2 (stmt)))
2019                     {
2020                       gsi_remove (&gsi, true);
2021                       release_defs (stmt);
2022                       continue;
2023                     }
2024                   break;
2025
2026                 case PLUS_EXPR:
2027                 case MINUS_EXPR:
2028                   convert_plusminus_to_widen (&gsi, stmt, code);
2029                   break;
2030
2031                 default:;
2032                 }
2033             }
2034           else if (is_gimple_call (stmt)
2035                    && gimple_call_lhs (stmt))
2036             {
2037               tree fndecl = gimple_call_fndecl (stmt);
2038               if (fndecl
2039                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2040                 {
2041                   switch (DECL_FUNCTION_CODE (fndecl))
2042                     {
2043                       case BUILT_IN_POWF:
2044                       case BUILT_IN_POW:
2045                       case BUILT_IN_POWL:
2046                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2047                             && REAL_VALUES_EQUAL
2048                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2049                                   dconst2)
2050                             && convert_mult_to_fma (stmt,
2051                                                     gimple_call_arg (stmt, 0),
2052                                                     gimple_call_arg (stmt, 0)))
2053                           {
2054                             unlink_stmt_vdef (stmt);
2055                             gsi_remove (&gsi, true);
2056                             release_defs (stmt);
2057                             if (gimple_purge_dead_eh_edges (bb))
2058                               cfg_changed = true;
2059                             continue;
2060                           }
2061                           break;
2062
2063                       default:;
2064                     }
2065                 }
2066             }
2067           gsi_next (&gsi);
2068         }
2069     }
2070
2071   statistics_counter_event (cfun, "widening multiplications inserted",
2072                             widen_mul_stats.widen_mults_inserted);
2073   statistics_counter_event (cfun, "widening maccs inserted",
2074                             widen_mul_stats.maccs_inserted);
2075   statistics_counter_event (cfun, "fused multiply-adds inserted",
2076                             widen_mul_stats.fmas_inserted);
2077
2078   return cfg_changed ? TODO_cleanup_cfg : 0;
2079 }
2080
2081 static bool
2082 gate_optimize_widening_mul (void)
2083 {
2084   return flag_expensive_optimizations && optimize;
2085 }
2086
2087 struct gimple_opt_pass pass_optimize_widening_mul =
2088 {
2089  {
2090   GIMPLE_PASS,
2091   "widening_mul",                       /* name */
2092   gate_optimize_widening_mul,           /* gate */
2093   execute_optimize_widening_mul,        /* execute */
2094   NULL,                                 /* sub */
2095   NULL,                                 /* next */
2096   0,                                    /* static_pass_number */
2097   TV_NONE,                              /* tv_id */
2098   PROP_ssa,                             /* properties_required */
2099   0,                                    /* properties_provided */
2100   0,                                    /* properties_destroyed */
2101   0,                                    /* todo_flags_start */
2102   TODO_verify_ssa
2103   | TODO_verify_stmts
2104   | TODO_dump_func
2105   | TODO_update_ssa                     /* todo_flags_finish */
2106  }
2107 };