gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 3, or (at your option) any
  10 later version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  22    operations.  These are common in sequences such as this one:
  23
  24         modulus = sqrt(x*x + y*y + z*z);
  25         x = x / modulus;
  26         y = y / modulus;
  27         z = z / modulus;
  28
  29    that can be optimized to
  30
  31         modulus = sqrt(x*x + y*y + z*z);
  32         rmodulus = 1.0 / modulus;
  33         x = x * rmodulus;
  34         y = y * rmodulus;
  35         z = z * rmodulus;
  36
  37    We do this for loop invariant divisors, and with this pass whenever
  38    we notice that a division has the same divisor multiple times.
  39
  40    Of course, like in PRE, we don't insert a division if a dominator
  41    already has one.  However, this cannot be done as an extension of
  42    PRE for several reasons.
  43
  44    First of all, with some experiments it was found out that the
  45    transformation is not always useful if there are only two divisions
  46    by the same divisor.  This is probably because modern processors
  47    can pipeline the divisions; on older, in-order processors it should
  48    still be effective to optimize two divisions by the same number.
  49    We make this a param, and it shall be called N in the remainder of
  50    this comment.
  51
  52    Second, if trapping math is active, we have less freedom on where
  53    to insert divisions: we can only do so in basic blocks that already
  54    contain one.  (If divisions don't trap, instead, we can insert
  55    divisions elsewhere, which will be in blocks that are common dominators
  56    of those that have the division).
  57
  58    We really don't want to compute the reciprocal unless a division will
  59    be found.  To do this, we won't insert the division in a basic block
  60    that has less than N divisions *post-dominating* it.
  61
  62    The algorithm constructs a subset of the dominator tree, holding the
  63    blocks containing the divisions and the common dominators to them,
  64    and walks it twice.  The first walk is in post-order, and it annotates
  65    each block with the number of divisions that post-dominate it: this
  66    gives information on where divisions can be inserted profitably.
  67    The second walk is in pre-order, and it inserts divisions as explained
  68    above, and replaces divisions by multiplications.
  69
  70    In the best case, the cost of the pass is O(n_statements).  In the
  71    worst-case, the cost is due to creating the dominator tree subset,
  72    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  73    for n_statements / n_basic_blocks statements.  So, the amortized cost
  74    of creating the dominator tree subset is O(n_basic_blocks) and the
  75    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  76
  77    More practically, the cost will be small because there are few
  78    divisions, and they tend to be in the same basic block, so insert_bb
  79    is called very few times.
  80
  81    If we did this using domwalk.c, an efficient implementation would have
  82    to work on all the variables in a single pass, because we could not
  83    work on just a subset of the dominator tree, as we do now, and the
  84    cost would also be something like O(n_statements * n_basic_blocks).
  85    The data structures would be more complex in order to work on all the
  86    variables in a single pass.  */
  87
  88 #include "config.h"
  89 #include "system.h"
  90 #include "coretypes.h"
  91 #include "tm.h"
  92 #include "flags.h"
  93 #include "tree.h"
  94 #include "tree-flow.h"
  95 #include "tree-pass.h"
  96 #include "alloc-pool.h"
  97 #include "basic-block.h"
  98 #include "target.h"
  99 #include "gimple-pretty-print.h"
 100
 101 /* FIXME: RTL headers have to be included here for optabs.  */
 102 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 103 #include "expr.h"               /* Because optabs.h wants sepops.  */
 104 #include "optabs.h"
 105
 106 /* This structure represents one basic block that either computes a
 107    division, or is a common dominator for basic blocks that compute a
 108    division.  Instances are obtained from occ_pool by occ_new, which
        also stores a pointer to the new instance in the AUX field of BB.  */
 109 struct occurrence {
 110   /* The basic block represented by this structure.  */
 111   basic_block bb;
 112
 113   /* If non-NULL, the definition of the reciprocal that can be used in
 114      BB.  insert_reciprocals stores here either the reciprocal it
          inserted in BB or the one handed down from a dominating
          occurrence.  */
 115   tree recip_def;
 116
 117   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 118      was inserted in BB.  replace_reciprocal compares against it so that
          the reciprocal computation itself is never rewritten.  */
 119   gimple recip_def_stmt;
 120
 121   /* Pointer to a list of "struct occurrence"s for blocks dominated
 122      by BB.  */
 123   struct occurrence *children;
 124
 125   /* Pointer to the next "struct occurrence"s in the list of blocks
 126      sharing a common dominator.  */
 127   struct occurrence *next;
 128
 129   /* The number of divisions that are in BB before compute_merit.  The
 130      number of divisions that are in BB or post-dominate it after
 131      compute_merit.  */
 132   int num_divisions;
 133
 134   /* True if the basic block has a division, false if it is a common
 135      dominator for basic blocks that do.  If it is false and trapping
 136      math is active, BB is not a candidate for inserting a reciprocal.  */
 137   bool bb_has_division;
 138 };
 139
     /* Counters for the reciprocal pass.  execute_cse_reciprocals clears
        them on entry and reports them with statistics_counter_event.  */
 140 static struct
 141 {
 142   /* Number of 1.0/X ops inserted.  */
 143   int rdivs_inserted;
 144
 145   /* Number of 1.0/FUNC ops inserted.  */
 146   int rfuncs_inserted;
 147 } reciprocal_stats;
 148
     /* Counter bumped by execute_cse_sincos_1 for each cexpi call it
        inserts.  */
 149 static struct
 150 {
 151   /* Number of cexpi calls inserted.  */
 152   int inserted;
 153 } sincos_stats;
 154
     /* Counters for byte-swap recognition.  NOTE(review): not referenced in
        the part of the file reviewed here.  */
 155 static struct
 156 {
 157   /* Number of hand-written 32-bit bswaps found.  */
 158   int found_32bit;
 159
 160   /* Number of hand-written 64-bit bswaps found.  */
 161   int found_64bit;
 162 } bswap_stats;
 163
     /* Counters for the widening-multiply and fused multiply-add
        transformations.  NOTE(review): not referenced in the part of the
        file reviewed here.  */
 164 static struct
 165 {
 166   /* Number of widening multiplication ops inserted.  */
 167   int widen_mults_inserted;
 168
 169   /* Number of integer multiply-and-accumulate ops inserted.  */
 170   int maccs_inserted;
 171
 172   /* Number of fp fused multiply-add ops inserted.  */
 173   int fmas_inserted;
 174 } widen_mul_stats;
 175
 176 /* The instance of "struct occurrence" representing the highest
 177    interesting block in the dominator tree.  This is the head of a list
        linked through the NEXT field; execute_cse_reciprocals_1 builds it
        for one divisor at a time and resets it to NULL when done.  */
 178 static struct occurrence *occ_head;
 179
 180 /* Allocation pool for getting instances of "struct occurrence".  It is
        created and destroyed by execute_cse_reciprocals.  */
 181 static alloc_pool occ_pool;
 182
 183
 184
 185 /* Create a zero-initialized "struct occurrence" for BB out of occ_pool,
 186    hang CHILDREN off it, record it in BB->aux and return it.  */
 187 static struct occurrence *
 188 occ_new (basic_block bb, struct occurrence *children)
 189 {
 190   struct occurrence *node = (struct occurrence *) pool_alloc (occ_pool);
 191
 192   memset (node, 0, sizeof *node);
 193   node->children = children;
 194   node->bb = bb;
 195   /* Let the block find its occurrence again later.  */
 196   bb->aux = node;
 197   return node;
 198 }
 199
 200
 201 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 202    list of "struct occurrence"s, one per basic block, having IDOM as
 203    their common dominator.
 204
 205    We try to insert NEW_OCC as deep as possible in the tree, and we also
 206    insert any other block that is a common dominator for BB and one
 207    block already in the tree.  (BB stands for NEW_OCC->bb here and in
        the comments below.)  */
 208
 209 static void
 210 insert_bb (struct occurrence *new_occ, basic_block idom,
 211            struct occurrence **p_head)
 212 {
 213   struct occurrence *occ, **p_occ;
 214
       /* P_OCC is advanced only in the last case below: the cases that unlink
          OCC from the list leave *P_OCC pointing at the following element.  */
 215   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 216     {
 217       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 218       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 219       if (dom == bb)
 220         {
 221           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 222              from its list.  */
 223           *p_occ = occ->next;
 224           occ->next = new_occ->children;
 225           new_occ->children = occ;
 226
 227           /* Try the next block (it may as well be dominated by BB).  */
 228         }
 229
 230       else if (dom == occ_bb)
 231         {
 232           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 233           insert_bb (new_occ, dom, &occ->children);
 234           return;
 235         }
 236
 237       else if (dom != idom)
 238         {
               /* DOM must not have an occurrence yet; occ_new below creates
                  one and stores it in DOM->aux.  */
 239           gcc_assert (!dom->aux);
 240
 241           /* There is a dominator between IDOM and BB, add it and make
 242              two children out of NEW_OCC and OCC.  First, remove OCC from
 243              its list.  */
 244           *p_occ = occ->next;
 245           new_occ->next = occ;
 246           occ->next = NULL;
 247
 248           /* None of the previous blocks has DOM as a dominator: if we tail
 249              recursed, we would reexamine them uselessly. Just switch BB with
 250              DOM, and go on looking for blocks dominated by DOM.  */
 251           new_occ = occ_new (dom, new_occ);
 252         }
 253
 254       else
 255         {
 256           /* Nothing special, go on with the next element.  */
 257           p_occ = &occ->next;
 258         }
 259     }
 260
 261   /* NEW_OCC was not placed below any element of the list: link it at the
          head of the list P_HEAD points to, next to the other occurrences
          that have IDOM as their common dominator.  */
 262   new_occ->next = *p_head;
 263   *p_head = new_occ;
 264 }
 265
 266 /* Note one more division in BB, creating BB's occurrence on first use.  */
 267
 268 static inline void
 269 register_division_in (basic_block bb)
 270 {
 271   struct occurrence *entry = (struct occurrence *) bb->aux;
 272
 273   if (entry == NULL)
 274     {
 275       /* First division seen in BB: make a leaf and file it in the tree.  */
 276       entry = occ_new (bb, NULL);
 277       insert_bb (entry, ENTRY_BLOCK_PTR, &occ_head);
 278     }
 279
 280   entry->num_divisions += 1;
 281   entry->bb_has_division = true;
 282 }
 283
 284
 285 /* Bottom-up walk over OCC's subtree: fold into every node the division
 286    counts of those children that post-dominate it.  */
 287
 288 static void
 289 compute_merit (struct occurrence *occ)
 290 {
 291   struct occurrence *kid = occ->children;
 292
 293   while (kid != NULL)
 294     {
 295       basic_block from;
 296       /* Leaves already hold their final count.  */
 297       if (kid->children != NULL)
 298         compute_merit (kid);
 299
 300       /* With exceptions enabled the test starts from the block that
 301          single_noncomplex_succ returns for OCC's block.  */
 302       from = flag_exceptions ? single_noncomplex_succ (occ->bb) : occ->bb;
 303
 304       if (dominated_by_p (CDI_POST_DOMINATORS, from, kid->bb))
 305         occ->num_divisions += kid->num_divisions;
 306       kid = kid->next;
 307     }
 308 }
 309
 310
 311 /* Return true iff USE_STMT divides (RDIV_EXPR) some other value by DEF.  */
 312 static inline bool
 313 is_division_by (gimple use_stmt, tree def)
 314 {
 315   if (!is_gimple_assign (use_stmt)
 316       || gimple_assign_rhs_code (use_stmt) != RDIV_EXPR
 317       || gimple_assign_rhs2 (use_stmt) != def)
 318     return false;
 319   /* DEF / DEF is rejected: replacing every use of DEF in such a
 320      statement later on would confuse the rewrite.  */
 321   return gimple_assign_rhs1 (use_stmt) != def;
 322 }
 323
 324 /* Walk the subset of the dominator tree rooted at OCC, setting the
 325    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 326    the given basic block.  The field may be left NULL, of course,
 327    if it is not possible or profitable to do the optimization.
 328
 329    DEF_GSI is an iterator pointing at the statement defining DEF, or
 330    NULL when the caller has none.  If RECIP_DEF is set, a dominator
 331    already has a computation that can be used.  THRESHOLD is the
        minimum OCC->num_divisions for which a reciprocal is inserted.  */
 332
 333 static void
 334 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 335                     tree def, tree recip_def, int threshold)
 336 {
 337   tree type;
 338   gimple new_stmt;
 339   gimple_stmt_iterator gsi;
 340   struct occurrence *occ_child;
 341
       /* Insert a new reciprocal only if none is inherited, the block may
          hold one (it has a division, or math does not trap) and enough
          divisions are counted for it.  */
 342   if (!recip_def
 343       && (occ->bb_has_division || !flag_trapping_math)
 344       && occ->num_divisions >= threshold)
 345     {
 346       /* Make a variable with the replacement and substitute it.  */
 347       type = TREE_TYPE (def);
 348       recip_def = create_tmp_reg (type, "reciptmp");
 349       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 350                                                build_one_cst (type), def);
 351
 352       if (occ->bb_has_division)
 353         {
 354           /* Case 1: insert before an existing division.  */
 355           gsi = gsi_after_labels (occ->bb);
 356           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 357             gsi_next (&gsi);
 358
 359           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 360         }
 361       else if (def_gsi && occ->bb == def_gsi->bb)
 362         {
 363           /* Case 2: insert right after the definition.  Note that this will
 364              never happen if the definition statement can throw, because in
 365              that case the sole successor of the statement's basic block will
 366              dominate all the uses as well.  */
 367           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 368         }
 369       else
 370         {
 371           /* Case 3: insert in a basic block not containing defs/uses.  */
 372           gsi = gsi_after_labels (occ->bb);
 373           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 374         }
 375
 376       reciprocal_stats.rdivs_inserted++;
 377
 378       occ->recip_def_stmt = new_stmt;
 379     }
 380
       /* Record the reciprocal available here (possibly NULL, possibly
          inherited from the caller) and hand it down to the children.  */
 381   occ->recip_def = recip_def;
 382   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 383     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 384 }
 385
 386
 387 /* Turn the division using USE_P into a multiplication by the reciprocal
 388    recorded for its block.  Nothing is done when the block is not optimized
 389    for speed, has no reciprocal, or the use is the reciprocal's own stmt.  */
 390 static inline void
 391 replace_reciprocal (use_operand_p use_p)
 392 {
 393   gimple div_stmt = USE_STMT (use_p);
 394   basic_block div_bb = gimple_bb (div_stmt);
 395   struct occurrence *info = (struct occurrence *) div_bb->aux;
 396   gimple_stmt_iterator pos;
 397
 398   if (!optimize_bb_for_speed_p (div_bb)
 399       || !info->recip_def || div_stmt == info->recip_def_stmt)
 400     return;
 401   pos = gsi_for_stmt (div_stmt);
 402   gimple_assign_set_rhs_code (div_stmt, MULT_EXPR);
 403   SET_USE (use_p, info->recip_def);
 404   fold_stmt_inplace (&pos);
 405   update_stmt (div_stmt);
 406 }
 407
 408
 409 /* Free OCC and return another "struct occurrence" left to free, or NULL.  */
 410
 411 static struct occurrence *
 412 free_bb (struct occurrence *occ)
 413 {
 414   struct occurrence *sibling = occ->next;
 415   struct occurrence *first_child = occ->children;
 416
 417   /* Both links have been saved, so OCC can be unhooked from its block and
 418      returned to the pool.  */
 419   occ->bb->aux = NULL;
 420   pool_free (occ_pool, occ);
 421
 422   /* A leaf simply continues with its sibling.  */
 423   if (first_child == NULL)
 424     return sibling;
 425
 426   /* Otherwise dispose of all the siblings here, recursively, and let the
 427      caller go on with the children.  */
 428   while (sibling != NULL)
 429     sibling = free_bb (sibling);
 430
 431   return first_child;
 432 }
 433
 434
 435 /* Look for floating-point divisions among DEF's uses, and try to
 436    replace them by multiplications with the reciprocal.  Add
 437    as many statements computing the reciprocal as needed.
 438
 439    DEF must be a GIMPLE register of a floating-point type.  DEF_GSI
        points at the statement defining DEF, or is NULL when the caller
        has no such statement (PHI results, parameter default definitions).  */
 440
 441 static void
 442 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 443 {
 444   use_operand_p use_p;
 445   imm_use_iterator use_iter;
 446   struct occurrence *occ;
 447   int count = 0, threshold;
 448
 449   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 450
       /* Count the divisions by DEF and build the occurrence tree, headed by
          occ_head, for the blocks that contain them.  */
 451   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 452     {
 453       gimple use_stmt = USE_STMT (use_p);
 454       if (is_division_by (use_stmt, def))
 455         {
 456           register_division_in (gimple_bb (use_stmt));
 457           count++;
 458         }
 459     }
 460
 461   /* Do the expensive part only if we can hope to optimize something.  */
 462   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 463   if (count >= threshold)
 464     {
 465       gimple use_stmt;
           /* First decide where reciprocals go and insert them...  */
 466       for (occ = occ_head; occ; occ = occ->next)
 467         {
 468           compute_merit (occ);
 469           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 470         }
 471
           /* ... then rewrite the divisions by DEF to use them.  */
 472       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 473         {
 474           if (is_division_by (use_stmt, def))
 475             {
 476               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 477                 replace_reciprocal (use_p);
 478             }
 479         }
 480     }
 481
       /* Release the occurrence tree (free_bb also clears the AUX fields) so
          that the next divisor starts from scratch.  */
 482   for (occ = occ_head; occ; )
 483     occ = free_bb (occ);
 484
 485   occ_head = NULL;
 486 }
 487
 488 static bool
 489 gate_cse_reciprocals (void)
 490 {
       /* Run the pass only when optimizing with reciprocal math allowed.  */
 491   return flag_reciprocal_math != 0 && optimize != 0;
 492 }
 493
 494 /* Go through all the floating-point SSA_NAMEs, and call
 495    execute_cse_reciprocals_1 on each of them.  Afterwards, in blocks that
        are not optimized for size, also turn a / func (b) into
        a * rfunc (b) when the target provides a reciprocal builtin for FUNC.
        Always returns 0.  */
 496 static unsigned int
 497 execute_cse_reciprocals (void)
 498 {
 499   basic_block bb;
 500   tree arg;
 501
 502   occ_pool = create_alloc_pool ("dominators for recip",
 503                                 sizeof (struct occurrence),
 504                                 n_basic_blocks / 3 + 1);
 505
 506   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 507   calculate_dominance_info (CDI_DOMINATORS);
 508   calculate_dominance_info (CDI_POST_DOMINATORS);
 509
       /* The AUX fields are used to map blocks to their occurrence, so they
          must all start out clear.  */
 510 #ifdef ENABLE_CHECKING
 511   FOR_EACH_BB (bb)
 512     gcc_assert (!bb->aux);
 513 #endif
 514
       /* Function parameters: handle the default definition of every
          floating-point register argument.  There is no defining statement,
          hence the NULL iterator.  */
 515   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 516     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 517         && is_gimple_reg (arg))
 518       {
 519         tree name = ssa_default_def (cfun, arg);
 520         if (name)
 521           execute_cse_reciprocals_1 (NULL, name);
 522       }
 523
 524   FOR_EACH_BB (bb)
 525     {
 526       gimple_stmt_iterator gsi;
 527       gimple phi;
 528       tree def;
 529
           /* Floating-point PHI results.  */
 530       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 531         {
 532           phi = gsi_stmt (gsi);
 533           def = PHI_RESULT (phi);
 534           if (! virtual_operand_p (def)
 535               && FLOAT_TYPE_P (TREE_TYPE (def)))
 536             execute_cse_reciprocals_1 (NULL, def);
 537         }
 538
           /* Floating-point SSA names defined by ordinary statements; the
              iterator is passed along so that a reciprocal can be inserted
              right after the definition.  */
 539       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 540         {
 541           gimple stmt = gsi_stmt (gsi);
 542
 543           if (gimple_has_lhs (stmt)
 544               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 545               && FLOAT_TYPE_P (TREE_TYPE (def))
 546               && TREE_CODE (def) == SSA_NAME)
 547             execute_cse_reciprocals_1 (&gsi, def);
 548         }
 549
 550       if (optimize_bb_for_size_p (bb))
 551         continue;
 552
 553       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 554       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 555         {
 556           gimple stmt = gsi_stmt (gsi);
 557           tree fndecl;
 558
 559           if (is_gimple_assign (stmt)
 560               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 561             {
 562               tree arg1 = gimple_assign_rhs2 (stmt);
 563               gimple stmt1;
 564
 565               if (TREE_CODE (arg1) != SSA_NAME)
 566                 continue;
 567
 568               stmt1 = SSA_NAME_DEF_STMT (arg1);
 569
                   /* The divisor must be the result of a call to a normal or
                      machine-dependent builtin.  */
 570               if (is_gimple_call (stmt1)
 571                   && gimple_call_lhs (stmt1)
 572                   && (fndecl = gimple_call_fndecl (stmt1))
 573                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 574                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 575                 {
 576                   enum built_in_function code;
 577                   bool md_code, fail;
 578                   imm_use_iterator ui;
 579                   use_operand_p use_p;
 580
 581                   code = DECL_FUNCTION_CODE (fndecl);
 582                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 583
                       /* Ask the target for the reciprocal counterpart of the
                          builtin; give up on this statement if it has none.  */
 584                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 585                   if (!fndecl)
 586                     continue;
 587
 588                   /* Check that all uses of the SSA name are divisions,
 589                      otherwise replacing the defining statement will do
 590                      the wrong thing.  */
 591                   fail = false;
 592                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 593                     {
 594                       gimple stmt2 = USE_STMT (use_p);
 595                       if (is_gimple_debug (stmt2))
 596                         continue;
 597                       if (!is_gimple_assign (stmt2)
 598                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 599                           || gimple_assign_rhs1 (stmt2) == arg1
 600                           || gimple_assign_rhs2 (stmt2) != arg1)
 601                         {
 602                           fail = true;
 603                           break;
 604                         }
 605                     }
 606                   if (fail)
 607                     continue;
 608
                       /* Make the call compute the reciprocal function...  */
 609                   gimple_replace_lhs (stmt1, arg1);
 610                   gimple_call_set_fndecl (stmt1, fndecl);
 611                   update_stmt (stmt1);
 612                   reciprocal_stats.rfuncs_inserted++;
 613
                       /* ... and turn every division by its result into a
                          multiplication.  Note that this loop reuses STMT as
                          its iteration variable and declares an inner GSI
                          that shadows the iterator of the enclosing loop.  */
 614                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 615                     {
 616                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 617                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 618                       fold_stmt_inplace (&gsi);
 619                       update_stmt (stmt);
 620                     }
 621                 }
 622             }
 623         }
 624     }
 625
 626   statistics_counter_event (cfun, "reciprocal divs inserted",
 627                             reciprocal_stats.rdivs_inserted);
 628   statistics_counter_event (cfun, "reciprocal functions inserted",
 629                             reciprocal_stats.rfuncs_inserted);
 630
 631   free_dominance_info (CDI_DOMINATORS);
 632   free_dominance_info (CDI_POST_DOMINATORS);
 633   free_alloc_pool (occ_pool);
 634   return 0;
 635 }
 636
     /* Pass descriptor for the reciprocal CSE pass: it is named "recip", is
        gated by gate_cse_reciprocals, runs execute_cse_reciprocals, requires
        SSA form and requests an SSA update plus SSA and statement
        verification when it finishes.  */
 637 struct gimple_opt_pass pass_cse_reciprocals =
 638 {
 639  {
 640   GIMPLE_PASS,
 641   "recip",                              /* name */
 642   gate_cse_reciprocals,                 /* gate */
 643   execute_cse_reciprocals,              /* execute */
 644   NULL,                                 /* sub */
 645   NULL,                                 /* next */
 646   0,                                    /* static_pass_number */
 647   TV_NONE,                              /* tv_id */
 648   PROP_ssa,                             /* properties_required */
 649   0,                                    /* properties_provided */
 650   0,                                    /* properties_destroyed */
 651   0,                                    /* todo_flags_start */
 652   TODO_update_ssa | TODO_verify_ssa
 653     | TODO_verify_stmts                /* todo_flags_finish */
 654  }
 655 };
 656
 657 /* Try to add USE_STMT to the vector *STMTS.  This succeeds, and true is
 658    returned, when *TOP_BB is still unset or when USE_STMT's block and
 659    *TOP_BB are related by dominance; *TOP_BB is then updated so that it
 660    remains the block dominating every recorded statement.  Otherwise
 661    nothing is recorded and false is returned.  */
 662
 663 static bool
 664 maybe_record_sincos (VEC(gimple, heap) **stmts,
 665                      basic_block *top_bb, gimple use_stmt)
 666 {
 667   basic_block stmt_bb = gimple_bb (use_stmt);
 668   basic_block cur_top = *top_bb;
 669   bool below_top = (cur_top != NULL
 670                     && (cur_top == stmt_bb
 671                         || dominated_by_p (CDI_DOMINATORS, stmt_bb, cur_top)));
 672   if (!below_top)
 673     {
 674       /* The statement can only be kept by becoming the new top.  */
 675       if (cur_top != NULL
 676           && !dominated_by_p (CDI_DOMINATORS, cur_top, stmt_bb))
 677         return false;
 678       *top_bb = stmt_bb;
 679     }
 680   VEC_safe_push (gimple, heap, *stmts, use_stmt);
 681   return true;
 682 }
 683
 684 /* Look for sin, cos and cexpi calls with the same argument NAME and
 685    create a single call to cexpi CSEing the result in this case.
 686    We first walk over all immediate uses of the argument collecting
 687    statements that we can CSE in a vector and in a second pass replace
 688    the statement rhs with a REALPART or IMAGPART expression on the
 689    result of the cexpi call we insert before the use statement that
 690    dominates all other candidates.
     
        Returns true if purging dead EH edges after a replacement changed
        the CFG, false otherwise (including when nothing was replaced).  */
 691
 692 static bool
 693 execute_cse_sincos_1 (tree name)
 694 {
 695   gimple_stmt_iterator gsi;
 696   imm_use_iterator use_iter;
 697   tree fndecl, res, type;
 698   gimple def_stmt, use_stmt, stmt;
 699   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 700   VEC(gimple, heap) *stmts = NULL;
 701   basic_block top_bb = NULL;
 702   int i;
 703   bool cfg_changed = false;
 704
 705   type = TREE_TYPE (name);
       /* Collect the builtin sin/cos/cexpi calls on NAME that have a result
          and can be related by dominance, remembering which kinds we saw.  */
 706   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 707     {
 708       if (gimple_code (use_stmt) != GIMPLE_CALL
 709           || !gimple_call_lhs (use_stmt)
 710           || !(fndecl = gimple_call_fndecl (use_stmt))
 711           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 712         continue;
 713
 714       switch (DECL_FUNCTION_CODE (fndecl))
 715         {
 716         CASE_FLT_FN (BUILT_IN_COS):
 717           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 718           break;
 719
 720         CASE_FLT_FN (BUILT_IN_SIN):
 721           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 722           break;
 723
 724         CASE_FLT_FN (BUILT_IN_CEXPI):
 725           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 726           break;
 727
 728         default:;
 729         }
 730     }
 731
       /* A single kind of call leaves nothing to share.  */
 732   if (seen_cos + seen_sin + seen_cexpi <= 1)
 733     {
 734       VEC_free(gimple, heap, stmts);
 735       return false;
 736     }
 737
 738   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 739      the name def statement.  */
 740   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 741   if (!fndecl)
         {
           /* No cexpi builtin is available for TYPE.  Release the candidate
              vector before giving up; it used to be leaked on this path.  */
           VEC_free (gimple, heap, stmts);
 742       return false;
         }
 743   stmt = gimple_build_call (fndecl, 1, name);
 744   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 745   gimple_call_set_lhs (stmt, res);
 746
 747   def_stmt = SSA_NAME_DEF_STMT (name);
 748   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 749       && gimple_code (def_stmt) != GIMPLE_PHI
 750       && gimple_bb (def_stmt) == top_bb)
 751     {
           /* NAME is defined by a statement inside TOP_BB: the call has to
              follow that definition.  */
 752       gsi = gsi_for_stmt (def_stmt);
 753       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 754     }
 755   else
 756     {
 757       gsi = gsi_after_labels (top_bb);
 758       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 759     }
 760   sincos_stats.inserted++;
 761
 762   /* And adjust the recorded old call sites.  */
 763   for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
 764     {
 765       tree rhs = NULL;
 766       fndecl = gimple_call_fndecl (use_stmt);
 767
 768       switch (DECL_FUNCTION_CODE (fndecl))
 769         {
 770         CASE_FLT_FN (BUILT_IN_COS):
 771           rhs = fold_build1 (REALPART_EXPR, type, res);
 772           break;
 773
 774         CASE_FLT_FN (BUILT_IN_SIN):
 775           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 776           break;
 777
 778         CASE_FLT_FN (BUILT_IN_CEXPI):
 779           rhs = res;
 780           break;
 781
 782         default:;
 783           gcc_unreachable ();
 784         }
 785
 786       /* Replace call with a copy.  */
 787       stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 788
 789       gsi = gsi_for_stmt (use_stmt);
 790       gsi_replace (&gsi, stmt, true);
           /* The copy cannot throw like the call could, so EH edges of the
              block may have become dead.  */
 791       if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 792         cfg_changed = true;
 793     }
 794
 795   VEC_free(gimple, heap, stmts);
 796
 797   return cfg_changed;
 798 }
 799
 800 /* To evaluate powi(x,n), the floating point value x raised to the
 801    constant integer exponent n, we use a hybrid algorithm that
 802    combines the "window method" with look-up tables.  For an
 803    introduction to exponentiation algorithms and "addition chains",
 804    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 805    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 806    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 807    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 808
 809 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 810    multiplications to inline before calling the system library's pow
 811    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 812    so this default never requires calling pow, powf or powl.  The
        #ifndef lets an earlier definition take precedence.  */
 813
 814 #ifndef POWI_MAX_MULTS
 815 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 816 #endif
 817
 818 /* The size of the "optimal power tree" lookup table.  All
 819    exponents less than this value are simply looked up in the
 820    powi_table below.  This threshold is also used to size the
 821    cache of pseudo registers that hold intermediate results.
        powi_table is declared with exactly this many entries.  */
 822 #define POWI_TABLE_SIZE 256
 823
 824 /* The size, in bits of the window, used in the "window method"
 825    exponentiation algorithm.  This is equivalent to a radix of
 826    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 827 #define POWI_WINDOW_SIZE 3
 828
 829 /* The following table is an efficient representation of an
 830    "optimal power tree".  For each value, i, the corresponding
 831    value, j, in the table states that an optimal evaluation
 832    sequence for calculating pow(x,i) can be found by evaluating
 833    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 834    100 integers is given in Knuth's "Seminumerical algorithms".
        The trailing comment on each row gives the range of indices i that
        the row covers.  */
 835
 836 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 837   {
 838       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 839       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 840       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 841      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 842      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 843      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 844      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 845      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 846      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 847      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 848      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 849      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 850      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 851      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 852      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 853      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 854      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 855      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 856      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 857      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 858      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 859      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 860      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 861      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 862      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 863     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 864     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 865     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 866     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 867     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 868     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 869     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 870   };
 871
 872
 873 /* Return the number of multiplications required to calculate
 874    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 875    subroutine of powi_cost.  CACHE is an array indicating
 876    which exponents have already been calculated.  */
 877
 878 static int
 879 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 880 {
 881   /* If we've already calculated this exponent, then this evaluation
 882      doesn't require any additional multiplications.  */
 883   if (cache[n])
 884     return 0;
 885
 886   cache[n] = true;
 887   return powi_lookup_cost (n - powi_table[n], cache)
 888          + powi_lookup_cost (powi_table[n], cache) + 1;
 889 }
 890
 891 /* Return the number of multiplications required to calculate
 892    powi(x,n) for an arbitrary x, given the exponent N.  This
 893    function needs to be kept in sync with powi_as_mults below.  */
 894
 895 static int
 896 powi_cost (HOST_WIDE_INT n)
 897 {
 898   bool cache[POWI_TABLE_SIZE];
 899   unsigned HOST_WIDE_INT digit;
 900   unsigned HOST_WIDE_INT val;
 901   int result;
 902
 903   if (n == 0)
 904     return 0;
 905
 906   /* Ignore the reciprocal when calculating the cost.  */
 907   val = (n < 0) ? -n : n;
 908
 909   /* Initialize the exponent cache.  */
 910   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 911   cache[1] = true;
 912
 913   result = 0;
 914
 915   while (val >= POWI_TABLE_SIZE)
 916     {
 917       if (val & 1)
 918         {
 919           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 920           result += powi_lookup_cost (digit, cache)
 921                     + POWI_WINDOW_SIZE + 1;
 922           val >>= POWI_WINDOW_SIZE;
 923         }
 924       else
 925         {
 926           val >>= 1;
 927           result++;
 928         }
 929     }
 930
 931   return result + powi_lookup_cost (val, cache);
 932 }
 933
 934 /* Recursive subroutine of powi_as_mults.  This function takes the
 935    array, CACHE, of already calculated exponents and an exponent N and
 936    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 937
 938 static tree
 939 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 940                  HOST_WIDE_INT n, tree *cache)
 941 {
 942   tree op0, op1, ssa_target;
 943   unsigned HOST_WIDE_INT digit;
 944   gimple mult_stmt;
 945
 946   if (n < POWI_TABLE_SIZE && cache[n])
 947     return cache[n];
 948
 949   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 950
 951   if (n < POWI_TABLE_SIZE)
 952     {
 953       cache[n] = ssa_target;
 954       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 955       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 956     }
 957   else if (n & 1)
 958     {
 959       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 960       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 961       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
 962     }
 963   else
 964     {
 965       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
 966       op1 = op0;
 967     }
 968
 969   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
 970   gimple_set_location (mult_stmt, loc);
 971   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 972
 973   return ssa_target;
 974 }
 975
 976 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 977    This function needs to be kept in sync with powi_cost above.  */
 978
 979 static tree
 980 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 981                tree arg0, HOST_WIDE_INT n)
 982 {
 983   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
 984   gimple div_stmt;
 985   tree target;
 986
 987   if (n == 0)
 988     return build_real (type, dconst1);
 989
 990   memset (cache, 0,  sizeof (cache));
 991   cache[1] = arg0;
 992
 993   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
 994   if (n >= 0)
 995     return result;
 996
 997   /* If the original exponent was negative, reciprocate the result.  */
 998   target = make_temp_ssa_name (type, NULL, "powmult");
 999   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1000                                            build_real (type, dconst1),
1001                                            result);
1002   gimple_set_location (div_stmt, loc);
1003   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1004
1005   return target;
1006 }
1007
1008 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1009    location info LOC.  If the arguments are appropriate, create an
1010    equivalent sequence of statements prior to GSI using an optimal
1011    number of multiplications, and return an expession holding the
1012    result.  */
1013
1014 static tree
1015 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1016                             tree arg0, HOST_WIDE_INT n)
1017 {
1018   /* Avoid largest negative number.  */
1019   if (n != -n
1020       && ((n >= -1 && n <= 2)
1021           || (optimize_function_for_speed_p (cfun)
1022               && powi_cost (n) <= POWI_MAX_MULTS)))
1023     return powi_as_mults (gsi, loc, arg0, n);
1024
1025   return NULL_TREE;
1026 }
1027
1028 /* Build a gimple call statement that calls FN with argument ARG.
1029    Set the lhs of the call statement to a fresh SSA name.  Insert the
1030    statement prior to GSI's current position, and return the fresh
1031    SSA name.  */
1032
1033 static tree
1034 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1035                        tree fn, tree arg)
1036 {
1037   gimple call_stmt;
1038   tree ssa_target;
1039
1040   call_stmt = gimple_build_call (fn, 1, arg);
1041   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1042   gimple_set_lhs (call_stmt, ssa_target);
1043   gimple_set_location (call_stmt, loc);
1044   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1045
1046   return ssa_target;
1047 }
1048
1049 /* Build a gimple binary operation with the given CODE and arguments
1050    ARG0, ARG1, assigning the result to a new SSA name for variable
1051    TARGET.  Insert the statement prior to GSI's current position, and
1052    return the fresh SSA name.*/
1053
1054 static tree
1055 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1056                         const char *name, enum tree_code code,
1057                         tree arg0, tree arg1)
1058 {
1059   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1060   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1061   gimple_set_location (stmt, loc);
1062   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1063   return result;
1064 }
1065
1066 /* Build a gimple reference operation with the given CODE and argument
1067    ARG, assigning the result to a new SSA name of TYPE with NAME.
1068    Insert the statement prior to GSI's current position, and return
1069    the fresh SSA name.  */
1070
1071 static inline tree
1072 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1073                       const char *name, enum tree_code code, tree arg0)
1074 {
1075   tree result = make_temp_ssa_name (type, NULL, name);
1076   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1077   gimple_set_location (stmt, loc);
1078   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1079   return result;
1080 }
1081
1082 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1083    prior to GSI's current position, and return the fresh SSA name.  */
1084
1085 static tree
1086 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1087                        tree type, tree val)
1088 {
1089   tree result = make_ssa_name (type, NULL);
1090   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1091   gimple_set_location (stmt, loc);
1092   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1093   return result;
1094 }
1095
1096 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1097    with location info LOC.  If possible, create an equivalent and
1098    less expensive sequence of statements prior to GSI, and return an
1099    expession holding the result.  */
1100
1101 static tree
1102 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1103                            tree arg0, tree arg1)
1104 {
1105   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1106   REAL_VALUE_TYPE c2, dconst3;
1107   HOST_WIDE_INT n;
1108   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1109   enum machine_mode mode;
1110   bool hw_sqrt_exists;
1111
1112   /* If the exponent isn't a constant, there's nothing of interest
1113      to be done.  */
1114   if (TREE_CODE (arg1) != REAL_CST)
1115     return NULL_TREE;
1116
1117   /* If the exponent is equivalent to an integer, expand to an optimal
1118      multiplication sequence when profitable.  */
1119   c = TREE_REAL_CST (arg1);
1120   n = real_to_integer (&c);
1121   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1122
1123   if (real_identical (&c, &cint)
1124       && ((n >= -1 && n <= 2)
1125           || (flag_unsafe_math_optimizations
1126               && optimize_insn_for_speed_p ()
1127               && powi_cost (n) <= POWI_MAX_MULTS)))
1128     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1129
1130   /* Attempt various optimizations using sqrt and cbrt.  */
1131   type = TREE_TYPE (arg0);
1132   mode = TYPE_MODE (type);
1133   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1134
1135   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1136      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1137      sqrt(-0) = -0.  */
1138   if (sqrtfn
1139       && REAL_VALUES_EQUAL (c, dconsthalf)
1140       && !HONOR_SIGNED_ZEROS (mode))
1141     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1142
1143   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1144      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1145      so do this optimization even if -Os.  Don't do this optimization
1146      if we don't have a hardware sqrt insn.  */
1147   dconst1_4 = dconst1;
1148   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1149   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1150
1151   if (flag_unsafe_math_optimizations
1152       && sqrtfn
1153       && REAL_VALUES_EQUAL (c, dconst1_4)
1154       && hw_sqrt_exists)
1155     {
1156       /* sqrt(x)  */
1157       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1158
1159       /* sqrt(sqrt(x))  */
1160       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1161     }
1162
1163   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1164      optimizing for space.  Don't do this optimization if we don't have
1165      a hardware sqrt insn.  */
1166   real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1167   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1168
1169   if (flag_unsafe_math_optimizations
1170       && sqrtfn
1171       && optimize_function_for_speed_p (cfun)
1172       && REAL_VALUES_EQUAL (c, dconst3_4)
1173       && hw_sqrt_exists)
1174     {
1175       /* sqrt(x)  */
1176       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1177
1178       /* sqrt(sqrt(x))  */
1179       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1180
1181       /* sqrt(x) * sqrt(sqrt(x))  */
1182       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1183                                      sqrt_arg0, sqrt_sqrt);
1184     }
1185
1186   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1187      optimizations since 1./3. is not exactly representable.  If x
1188      is negative and finite, the correct value of pow(x,1./3.) is
1189      a NaN with the "invalid" exception raised, because the value
1190      of 1./3. actually has an even denominator.  The correct value
1191      of cbrt(x) is a negative real value.  */
1192   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1193   dconst1_3 = real_value_truncate (mode, dconst_third ());
1194
1195   if (flag_unsafe_math_optimizations
1196       && cbrtfn
1197       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1198       && REAL_VALUES_EQUAL (c, dconst1_3))
1199     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1200
1201   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1202      if we don't have a hardware sqrt insn.  */
1203   dconst1_6 = dconst1_3;
1204   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1205
1206   if (flag_unsafe_math_optimizations
1207       && sqrtfn
1208       && cbrtfn
1209       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1210       && optimize_function_for_speed_p (cfun)
1211       && hw_sqrt_exists
1212       && REAL_VALUES_EQUAL (c, dconst1_6))
1213     {
1214       /* sqrt(x)  */
1215       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1216
1217       /* cbrt(sqrt(x))  */
1218       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1219     }
1220
1221   /* Optimize pow(x,c), where n = 2c for some nonzero integer n, into
1222
1223        sqrt(x) * powi(x, n/2),                n > 0;
1224        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1225
1226      Do not calculate the powi factor when n/2 = 0.  */
1227   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1228   n = real_to_integer (&c2);
1229   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1230
1231   if (flag_unsafe_math_optimizations
1232       && sqrtfn
1233       && real_identical (&c2, &cint))
1234     {
1235       tree powi_x_ndiv2 = NULL_TREE;
1236
1237       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1238          possible or profitable, give up.  Skip the degenerate case when
1239          n is 1 or -1, where the result is always 1.  */
1240       if (absu_hwi (n) != 1)
1241         {
1242           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1243                                                      abs_hwi (n / 2));
1244           if (!powi_x_ndiv2)
1245             return NULL_TREE;
1246         }
1247
1248       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1249          result of the optimal multiply sequence just calculated.  */
1250       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1251
1252       if (absu_hwi (n) == 1)
1253         result = sqrt_arg0;
1254       else
1255         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1256                                          sqrt_arg0, powi_x_ndiv2);
1257
1258       /* If n is negative, reciprocate the result.  */
1259       if (n < 0)
1260         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1261                                          build_real (type, dconst1), result);
1262       return result;
1263     }
1264
1265   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1266
1267      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1268      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1269
1270      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1271      different from pow(x, 1./3.) due to rounding and behavior with
1272      negative x, we need to constrain this transformation to unsafe
1273      math and positive x or finite math.  */
1274   real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1275   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1276   real_round (&c2, mode, &c2);
1277   n = real_to_integer (&c2);
1278   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1279   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1280   real_convert (&c2, mode, &c2);
1281
1282   if (flag_unsafe_math_optimizations
1283       && cbrtfn
1284       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1285       && real_identical (&c2, &c)
1286       && optimize_function_for_speed_p (cfun)
1287       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1288     {
1289       tree powi_x_ndiv3 = NULL_TREE;
1290
1291       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1292          possible or profitable, give up.  Skip the degenerate case when
1293          abs(n) < 3, where the result is always 1.  */
1294       if (absu_hwi (n) >= 3)
1295         {
1296           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1297                                                      abs_hwi (n / 3));
1298           if (!powi_x_ndiv3)
1299             return NULL_TREE;
1300         }
1301
1302       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1303          as that creates an unnecessary variable.  Instead, just produce
1304          either cbrt(x) or cbrt(x) * cbrt(x).  */
1305       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1306
1307       if (absu_hwi (n) % 3 == 1)
1308         powi_cbrt_x = cbrt_x;
1309       else
1310         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1311                                               cbrt_x, cbrt_x);
1312
1313       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1314       if (absu_hwi (n) < 3)
1315         result = powi_cbrt_x;
1316       else
1317         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1318                                          powi_x_ndiv3, powi_cbrt_x);
1319
1320       /* If n is negative, reciprocate the result.  */
1321       if (n < 0)
1322         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1323                                          build_real (type, dconst1), result);
1324
1325       return result;
1326     }
1327
1328   /* No optimizations succeeded.  */
1329   return NULL_TREE;
1330 }
1331
1332 /* ARG is the argument to a cabs builtin call in GSI with location info
1333    LOC.  Create a sequence of statements prior to GSI that calculates
1334    sqrt(R*R + I*I), where R and I are the real and imaginary components
1335    of ARG, respectively.  Return an expression holding the result.  */
1336
1337 static tree
1338 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1339 {
1340   tree real_part, imag_part, addend1, addend2, sum, result;
1341   tree type = TREE_TYPE (TREE_TYPE (arg));
1342   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1343   enum machine_mode mode = TYPE_MODE (type);
1344
1345   if (!flag_unsafe_math_optimizations
1346       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1347       || !sqrtfn
1348       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1349     return NULL_TREE;
1350
1351   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1352                                     REALPART_EXPR, arg);
1353   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1354                                     real_part, real_part);
1355   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1356                                     IMAGPART_EXPR, arg);
1357   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1358                                     imag_part, imag_part);
1359   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1360   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1361
1362   return result;
1363 }
1364
1365 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1366    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1367    an optimal number of multiplies, when n is a constant.  */
1368
1369 static unsigned int
1370 execute_cse_sincos (void)
1371 {
1372   basic_block bb;
1373   bool cfg_changed = false;
1374
1375   calculate_dominance_info (CDI_DOMINATORS);
1376   memset (&sincos_stats, 0, sizeof (sincos_stats));
1377
1378   FOR_EACH_BB (bb)
1379     {
1380       gimple_stmt_iterator gsi;
1381
1382       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1383         {
1384           gimple stmt = gsi_stmt (gsi);
1385           tree fndecl;
1386
1387           if (is_gimple_call (stmt)
1388               && gimple_call_lhs (stmt)
1389               && (fndecl = gimple_call_fndecl (stmt))
1390               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1391             {
1392               tree arg, arg0, arg1, result;
1393               HOST_WIDE_INT n;
1394               location_t loc;
1395
1396               switch (DECL_FUNCTION_CODE (fndecl))
1397                 {
1398                 CASE_FLT_FN (BUILT_IN_COS):
1399                 CASE_FLT_FN (BUILT_IN_SIN):
1400                 CASE_FLT_FN (BUILT_IN_CEXPI):
1401                   /* Make sure we have either sincos or cexp.  */
1402                   if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
1403                     break;
1404
1405                   arg = gimple_call_arg (stmt, 0);
1406                   if (TREE_CODE (arg) == SSA_NAME)
1407                     cfg_changed |= execute_cse_sincos_1 (arg);
1408                   break;
1409
1410                 CASE_FLT_FN (BUILT_IN_POW):
1411                   arg0 = gimple_call_arg (stmt, 0);
1412                   arg1 = gimple_call_arg (stmt, 1);
1413
1414                   loc = gimple_location (stmt);
1415                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1416
1417                   if (result)
1418                     {
1419                       tree lhs = gimple_get_lhs (stmt);
1420                       gimple new_stmt = gimple_build_assign (lhs, result);
1421                       gimple_set_location (new_stmt, loc);
1422                       unlink_stmt_vdef (stmt);
1423                       gsi_replace (&gsi, new_stmt, true);
1424                       if (gimple_vdef (stmt))
1425                         release_ssa_name (gimple_vdef (stmt));
1426                     }
1427                   break;
1428
1429                 CASE_FLT_FN (BUILT_IN_POWI):
1430                   arg0 = gimple_call_arg (stmt, 0);
1431                   arg1 = gimple_call_arg (stmt, 1);
1432                   if (!host_integerp (arg1, 0))
1433                     break;
1434
1435                   n = TREE_INT_CST_LOW (arg1);
1436                   loc = gimple_location (stmt);
1437                   result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1438
1439                   if (result)
1440                     {
1441                       tree lhs = gimple_get_lhs (stmt);
1442                       gimple new_stmt = gimple_build_assign (lhs, result);
1443                       gimple_set_location (new_stmt, loc);
1444                       unlink_stmt_vdef (stmt);
1445                       gsi_replace (&gsi, new_stmt, true);
1446                       if (gimple_vdef (stmt))
1447                         release_ssa_name (gimple_vdef (stmt));
1448                     }
1449                   break;
1450
1451                 CASE_FLT_FN (BUILT_IN_CABS):
1452                   arg0 = gimple_call_arg (stmt, 0);
1453                   loc = gimple_location (stmt);
1454                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1455
1456                   if (result)
1457                     {
1458                       tree lhs = gimple_get_lhs (stmt);
1459                       gimple new_stmt = gimple_build_assign (lhs, result);
1460                       gimple_set_location (new_stmt, loc);
1461                       unlink_stmt_vdef (stmt);
1462                       gsi_replace (&gsi, new_stmt, true);
1463                       if (gimple_vdef (stmt))
1464                         release_ssa_name (gimple_vdef (stmt));
1465                     }
1466                   break;
1467
1468                 default:;
1469                 }
1470             }
1471         }
1472     }
1473
1474   statistics_counter_event (cfun, "sincos statements inserted",
1475                             sincos_stats.inserted);
1476
1477   free_dominance_info (CDI_DOMINATORS);
1478   return cfg_changed ? TODO_cleanup_cfg : 0;
1479 }
1480
1481 static bool
1482 gate_cse_sincos (void)
1483 {
1484   /* We no longer require either sincos or cexp, since powi expansion
1485      piggybacks on this pass.  */
1486   return optimize;
1487 }
1488
/* Descriptor of the "sincos" GIMPLE pass: gated by gate_cse_sincos,
   run by execute_cse_sincos, requires SSA form, and requests an SSA
   update followed by SSA and statement verification when done.  */
1489 struct gimple_opt_pass pass_cse_sincos =
1490 {
1491  {
1492   GIMPLE_PASS,
1493   "sincos",                             /* name */
1494   gate_cse_sincos,                      /* gate */
1495   execute_cse_sincos,                   /* execute */
1496   NULL,                                 /* sub */
1497   NULL,                                 /* next */
1498   0,                                    /* static_pass_number */
1499   TV_NONE,                              /* tv_id */
1500   PROP_ssa,                             /* properties_required */
1501   0,                                    /* properties_provided */
1502   0,                                    /* properties_destroyed */
1503   0,                                    /* todo_flags_start */
1504   TODO_update_ssa | TODO_verify_ssa
1505     | TODO_verify_stmts                 /* todo_flags_finish */
1506  }
1507 };
1508
1509 /* A symbolic number is used to detect byte permutation and selection
1510    patterns.  Therefore the field N contains an artificial number
1511    consisting of byte size markers, one marker per byte of the value:
1512
1513    0       - byte has the value 0
1514    1..size - byte contains the content of the source byte whose
1515              number is that value minus one  */
1516
1517 struct symbolic_number {
1518   unsigned HOST_WIDEST_INT n;  /* The markers, eight bits apiece.  */
1519   int size;                    /* Number of bytes of N in use.  */
1520 };
1521
1522 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1523    number N.  Return false if the requested operation is not permitted
1524    on a symbolic number.  */
1525
1526 static inline bool
1527 do_shift_rotate (enum tree_code code,
1528                  struct symbolic_number *n,
1529                  int count)
1530 {
1531   if (count % 8 != 0)
1532     return false;
1533
1534   /* Zero out the extra bits of N in order to avoid them being shifted
1535      into the significant bits.  */
1536   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1537     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1538
1539   switch (code)
1540     {
1541     case LSHIFT_EXPR:
1542       n->n <<= count;
1543       break;
1544     case RSHIFT_EXPR:
1545       n->n >>= count;
1546       break;
1547     case LROTATE_EXPR:
1548       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1549       break;
1550     case RROTATE_EXPR:
1551       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1552       break;
1553     default:
1554       return false;
1555     }
1556   /* Zero unused bits for size.  */
1557   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1558     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1559   return true;
1560 }
1561
1562 /* Perform sanity checking for the symbolic number N and the gimple
1563    statement STMT.  */
1564
1565 static inline bool
1566 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1567 {
1568   tree lhs_type;
1569
1570   lhs_type = gimple_expr_type (stmt);
1571
1572   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1573     return false;
1574
1575   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1576     return false;
1577
1578   return true;
1579 }
1580
1581 /* find_bswap_1 invokes itself recursively with N and tries to perform
1582    the operation given by the rhs of STMT on the result.  If the
1583    operation could successfully be executed the function returns the
1584    tree expression of the source operand and NULL otherwise.  */
1585
1586 static tree
1587 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1588 {
1589   enum tree_code code;
1590   tree rhs1, rhs2 = NULL;
1591   gimple rhs1_stmt, rhs2_stmt;
1592   tree source_expr1;
1593   enum gimple_rhs_class rhs_class;
1594
1595   if (!limit || !is_gimple_assign (stmt))
1596     return NULL_TREE;
1597
1598   rhs1 = gimple_assign_rhs1 (stmt);
1599
1600   if (TREE_CODE (rhs1) != SSA_NAME)
1601     return NULL_TREE;
1602
1603   code = gimple_assign_rhs_code (stmt);
1604   rhs_class = gimple_assign_rhs_class (stmt);
1605   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1606
1607   if (rhs_class == GIMPLE_BINARY_RHS)
1608     rhs2 = gimple_assign_rhs2 (stmt);
1609
1610   /* Handle unary rhs and binary rhs with integer constants as second
1611      operand.  */
1612
1613   if (rhs_class == GIMPLE_UNARY_RHS
1614       || (rhs_class == GIMPLE_BINARY_RHS
1615           && TREE_CODE (rhs2) == INTEGER_CST))
1616     {
1617       if (code != BIT_AND_EXPR
1618           && code != LSHIFT_EXPR
1619           && code != RSHIFT_EXPR
1620           && code != LROTATE_EXPR
1621           && code != RROTATE_EXPR
1622           && code != NOP_EXPR
1623           && code != CONVERT_EXPR)
1624         return NULL_TREE;
1625
1626       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1627
1628       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1629          to initialize the symbolic number.  */
1630       if (!source_expr1)
1631         {
1632           /* Set up the symbolic number N by setting each byte to a
1633              value between 1 and the byte size of rhs1.  The highest
1634              order byte is set to n->size and the lowest order
1635              byte to 1.  */
1636           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1637           if (n->size % BITS_PER_UNIT != 0)
1638             return NULL_TREE;
1639           n->size /= BITS_PER_UNIT;
1640           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1641                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1642
1643           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1644             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1645                      (n->size * BITS_PER_UNIT)) - 1;
1646
1647           source_expr1 = rhs1;
1648         }
1649
1650       switch (code)
1651         {
1652         case BIT_AND_EXPR:
1653           {
1654             int i;
1655             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1656             unsigned HOST_WIDEST_INT tmp = val;
1657
1658             /* Only constants masking full bytes are allowed.  */
1659             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1660               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1661                 return NULL_TREE;
1662
1663             n->n &= val;
1664           }
1665           break;
1666         case LSHIFT_EXPR:
1667         case RSHIFT_EXPR:
1668         case LROTATE_EXPR:
1669         case RROTATE_EXPR:
1670           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1671             return NULL_TREE;
1672           break;
1673         CASE_CONVERT:
1674           {
1675             int type_size;
1676
1677             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1678             if (type_size % BITS_PER_UNIT != 0)
1679               return NULL_TREE;
1680
1681             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1682               {
1683                 /* If STMT casts to a smaller type mask out the bits not
1684                    belonging to the target type.  */
1685                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1686               }
1687             n->size = type_size / BITS_PER_UNIT;
1688           }
1689           break;
1690         default:
1691           return NULL_TREE;
1692         };
1693       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1694     }
1695
1696   /* Handle binary rhs.  */
1697
1698   if (rhs_class == GIMPLE_BINARY_RHS)
1699     {
1700       struct symbolic_number n1, n2;
1701       tree source_expr2;
1702
1703       if (code != BIT_IOR_EXPR)
1704         return NULL_TREE;
1705
1706       if (TREE_CODE (rhs2) != SSA_NAME)
1707         return NULL_TREE;
1708
1709       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1710
1711       switch (code)
1712         {
1713         case BIT_IOR_EXPR:
1714           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1715
1716           if (!source_expr1)
1717             return NULL_TREE;
1718
1719           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1720
1721           if (source_expr1 != source_expr2
1722               || n1.size != n2.size)
1723             return NULL_TREE;
1724
1725           n->size = n1.size;
1726           n->n = n1.n | n2.n;
1727
1728           if (!verify_symbolic_number_p (n, stmt))
1729             return NULL_TREE;
1730
1731           break;
1732         default:
1733           return NULL_TREE;
1734         }
1735       return source_expr1;
1736     }
1737   return NULL_TREE;
1738 }
1739
1740 /* Check if STMT completes a bswap implementation consisting of ORs,
1741    SHIFTs and ANDs.  Return the source tree expression on which the
1742    byte swap is performed and NULL if no bswap was found.  */
1743
1744 static tree
1745 find_bswap (gimple stmt)
1746 {
1747 /* The number which the find_bswap result should match in order to
1748    have a full byte swap.  The number is shifted to the left according
1749    to the size of the symbolic number before using it.  */
1750   unsigned HOST_WIDEST_INT cmp =
1751     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1752     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1753
1754   struct symbolic_number n;
1755   tree source_expr;
1756   int limit;
1757
1758   /* The last parameter determines the depth search limit.  It usually
1759      correlates directly to the number of bytes to be touched.  We
1760      increase that number by three  here in order to also
1761      cover signed -> unsigned converions of the src operand as can be seen
1762      in libgcc, and for initial shift/and operation of the src operand.  */
1763   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1764   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1765   source_expr =  find_bswap_1 (stmt, &n, limit);
1766
1767   if (!source_expr)
1768     return NULL_TREE;
1769
1770   /* Zero out the extra bits of N and CMP.  */
1771   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1772     {
1773       unsigned HOST_WIDEST_INT mask =
1774         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1775
1776       n.n &= mask;
1777       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1778     }
1779
1780   /* A complete byte swap should make the symbolic number to start
1781      with the largest digit in the highest order byte.  */
1782   if (cmp != n.n)
1783     return NULL_TREE;
1784
1785   return source_expr;
1786 }
1787
1788 /* Find manual byte swap implementations and turn them into a bswap
1789    builtin invokation.  */
1790
1791 static unsigned int
1792 execute_optimize_bswap (void)
1793 {
1794   basic_block bb;
1795   bool bswap32_p, bswap64_p;
1796   bool changed = false;
1797   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1798
1799   if (BITS_PER_UNIT != 8)
1800     return 0;
1801
1802   if (sizeof (HOST_WIDEST_INT) < 8)
1803     return 0;
1804
1805   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1806                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1807   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1808                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1809                    || (bswap32_p && word_mode == SImode)));
1810
1811   if (!bswap32_p && !bswap64_p)
1812     return 0;
1813
1814   /* Determine the argument type of the builtins.  The code later on
1815      assumes that the return and argument type are the same.  */
1816   if (bswap32_p)
1817     {
1818       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1819       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1820     }
1821
1822   if (bswap64_p)
1823     {
1824       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1825       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1826     }
1827
1828   memset (&bswap_stats, 0, sizeof (bswap_stats));
1829
1830   FOR_EACH_BB (bb)
1831     {
1832       gimple_stmt_iterator gsi;
1833
1834       /* We do a reverse scan for bswap patterns to make sure we get the
1835          widest match. As bswap pattern matching doesn't handle
1836          previously inserted smaller bswap replacements as sub-
1837          patterns, the wider variant wouldn't be detected.  */
1838       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1839         {
1840           gimple stmt = gsi_stmt (gsi);
1841           tree bswap_src, bswap_type;
1842           tree bswap_tmp;
1843           tree fndecl = NULL_TREE;
1844           int type_size;
1845           gimple call;
1846
1847           if (!is_gimple_assign (stmt)
1848               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1849             continue;
1850
1851           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1852
1853           switch (type_size)
1854             {
1855             case 32:
1856               if (bswap32_p)
1857                 {
1858                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1859                   bswap_type = bswap32_type;
1860                 }
1861               break;
1862             case 64:
1863               if (bswap64_p)
1864                 {
1865                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1866                   bswap_type = bswap64_type;
1867                 }
1868               break;
1869             default:
1870               continue;
1871             }
1872
1873           if (!fndecl)
1874             continue;
1875
1876           bswap_src = find_bswap (stmt);
1877
1878           if (!bswap_src)
1879             continue;
1880
1881           changed = true;
1882           if (type_size == 32)
1883             bswap_stats.found_32bit++;
1884           else
1885             bswap_stats.found_64bit++;
1886
1887           bswap_tmp = bswap_src;
1888
1889           /* Convert the src expression if necessary.  */
1890           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1891             {
1892               gimple convert_stmt;
1893               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
1894               convert_stmt = gimple_build_assign_with_ops
1895                                 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
1896               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1897             }
1898
1899           call = gimple_build_call (fndecl, 1, bswap_tmp);
1900
1901           bswap_tmp = gimple_assign_lhs (stmt);
1902
1903           /* Convert the result if necessary.  */
1904           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1905             {
1906               gimple convert_stmt;
1907               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
1908               convert_stmt = gimple_build_assign_with_ops
1909                         (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1910               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1911             }
1912
1913           gimple_call_set_lhs (call, bswap_tmp);
1914
1915           if (dump_file)
1916             {
1917               fprintf (dump_file, "%d bit bswap implementation found at: ",
1918                        (int)type_size);
1919               print_gimple_stmt (dump_file, stmt, 0, 0);
1920             }
1921
1922           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1923           gsi_remove (&gsi, true);
1924         }
1925     }
1926
1927   statistics_counter_event (cfun, "32-bit bswap implementations found",
1928                             bswap_stats.found_32bit);
1929   statistics_counter_event (cfun, "64-bit bswap implementations found",
1930                             bswap_stats.found_64bit);
1931
1932   return (changed ? TODO_update_ssa | TODO_verify_ssa
1933           | TODO_verify_stmts : 0);
1934 }
1935
1936 static bool
1937 gate_optimize_bswap (void)
1938 {
1939   return flag_expensive_optimizations && optimize;
1940 }
1941
/* Descriptor of the "bswap" GIMPLE pass.  It is gated by
   gate_optimize_bswap, executed by execute_optimize_bswap, requires
   PROP_ssa and requests no TODO flags in the descriptor itself (the
   execute function returns its own TODO flags when it changes code).  */
1942 struct gimple_opt_pass pass_optimize_bswap =
1943 {
1944  {
1945   GIMPLE_PASS,
1946   "bswap",                              /* name */
1947   gate_optimize_bswap,                  /* gate */
1948   execute_optimize_bswap,               /* execute */
1949   NULL,                                 /* sub */
1950   NULL,                                 /* next */
1951   0,                                    /* static_pass_number */
1952   TV_NONE,                              /* tv_id */
1953   PROP_ssa,                             /* properties_required */
1954   0,                                    /* properties_provided */
1955   0,                                    /* properties_destroyed */
1956   0,                                    /* todo_flags_start */
1957   0                                     /* todo_flags_finish */
1958  }
1959 };
1960
1961 /* Return true if stmt is a type conversion operation that can be stripped
1962    when used in a widening multiply operation.  */
1963 static bool
1964 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
1965 {
1966   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
1967
1968   if (TREE_CODE (result_type) == INTEGER_TYPE)
1969     {
1970       tree op_type;
1971       tree inner_op_type;
1972
1973       if (!CONVERT_EXPR_CODE_P (rhs_code))
1974         return false;
1975
1976       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
1977
1978       /* If the type of OP has the same precision as the result, then
1979          we can strip this conversion.  The multiply operation will be
1980          selected to create the correct extension as a by-product.  */
1981       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
1982         return true;
1983
1984       /* We can also strip a conversion if it preserves the signed-ness of
1985          the operation and doesn't narrow the range.  */
1986       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
1987
1988       if (TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type)
1989           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
1990         return true;
1991
1992       return false;
1993     }
1994
1995   return rhs_code == FIXED_CONVERT_EXPR;
1996 }
1997
1998 /* Return true if RHS is a suitable operand for a widening multiplication,
1999    assuming a target type of TYPE.
2000    There are two cases:
2001
2002      - RHS makes some value at least twice as wide.  Store that value
2003        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2004
2005      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2006        but leave *TYPE_OUT untouched.  */
2007
2008 static bool
2009 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2010                         tree *new_rhs_out)
2011 {
2012   gimple stmt;
2013   tree type1, rhs1;
2014   enum tree_code rhs_code;
2015
2016   if (TREE_CODE (rhs) == SSA_NAME)
2017     {
2018       stmt = SSA_NAME_DEF_STMT (rhs);
2019       if (is_gimple_assign (stmt))
2020         {
2021           rhs_code = gimple_assign_rhs_code (stmt);
2022           if (! widening_mult_conversion_strippable_p (type, stmt))
2023             rhs1 = rhs;
2024           else
2025             {
2026               rhs1 = gimple_assign_rhs1 (stmt);
2027
2028               if (TREE_CODE (rhs1) == INTEGER_CST)
2029                 {
2030                   *new_rhs_out = rhs1;
2031                   *type_out = NULL;
2032                   return true;
2033                 }
2034             }
2035         }
2036       else
2037         rhs1 = rhs;
2038
2039       type1 = TREE_TYPE (rhs1);
2040
2041       if (TREE_CODE (type1) != TREE_CODE (type)
2042           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2043         return false;
2044
2045       *new_rhs_out = rhs1;
2046       *type_out = type1;
2047       return true;
2048     }
2049
2050   if (TREE_CODE (rhs) == INTEGER_CST)
2051     {
2052       *new_rhs_out = rhs;
2053       *type_out = NULL;
2054       return true;
2055     }
2056
2057   return false;
2058 }
2059
2060 /* Return true if STMT performs a widening multiplication, assuming the
2061    output type is TYPE.  If so, store the unwidened types of the operands
2062    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2063    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2064    and *TYPE2_OUT would give the operands of the multiplication.  */
2065
2066 static bool
2067 is_widening_mult_p (gimple stmt,
2068                     tree *type1_out, tree *rhs1_out,
2069                     tree *type2_out, tree *rhs2_out)
2070 {
2071   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2072
2073   if (TREE_CODE (type) != INTEGER_TYPE
2074       && TREE_CODE (type) != FIXED_POINT_TYPE)
2075     return false;
2076
2077   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2078                                rhs1_out))
2079     return false;
2080
2081   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2082                                rhs2_out))
2083     return false;
2084
2085   if (*type1_out == NULL)
2086     {
2087       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2088         return false;
2089       *type1_out = *type2_out;
2090     }
2091
2092   if (*type2_out == NULL)
2093     {
2094       if (!int_fits_type_p (*rhs2_out, *type1_out))
2095         return false;
2096       *type2_out = *type1_out;
2097     }
2098
2099   /* Ensure that the larger of the two operands comes first. */
2100   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2101     {
2102       tree tmp;
2103       tmp = *type1_out;
2104       *type1_out = *type2_out;
2105       *type2_out = tmp;
2106       tmp = *rhs1_out;
2107       *rhs1_out = *rhs2_out;
2108       *rhs2_out = tmp;
2109     }
2110
2111   return true;
2112 }
2113
2114 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2115    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2116    value is true iff we converted the statement.  */
2117
2118 static bool
2119 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2120 {
2121   tree lhs, rhs1, rhs2, type, type1, type2;
2122   enum insn_code handler;
2123   enum machine_mode to_mode, from_mode, actual_mode;
2124   optab op;
2125   int actual_precision;
2126   location_t loc = gimple_location (stmt);
2127   bool from_unsigned1, from_unsigned2;
2128
2129   lhs = gimple_assign_lhs (stmt);
2130   type = TREE_TYPE (lhs);
2131   if (TREE_CODE (type) != INTEGER_TYPE)
2132     return false;
2133
2134   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2135     return false;
2136
2137   to_mode = TYPE_MODE (type);
2138   from_mode = TYPE_MODE (type1);
2139   from_unsigned1 = TYPE_UNSIGNED (type1);
2140   from_unsigned2 = TYPE_UNSIGNED (type2);
2141
2142   if (from_unsigned1 && from_unsigned2)
2143     op = umul_widen_optab;
2144   else if (!from_unsigned1 && !from_unsigned2)
2145     op = smul_widen_optab;
2146   else
2147     op = usmul_widen_optab;
2148
2149   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2150                                                   0, &actual_mode);
2151
2152   if (handler == CODE_FOR_nothing)
2153     {
2154       if (op != smul_widen_optab)
2155         {
2156           /* We can use a signed multiply with unsigned types as long as
2157              there is a wider mode to use, or it is the smaller of the two
2158              types that is unsigned.  Note that type1 >= type2, always.  */
2159           if ((TYPE_UNSIGNED (type1)
2160                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2161               || (TYPE_UNSIGNED (type2)
2162                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2163             {
2164               from_mode = GET_MODE_WIDER_MODE (from_mode);
2165               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2166                 return false;
2167             }
2168
2169           op = smul_widen_optab;
2170           handler = find_widening_optab_handler_and_mode (op, to_mode,
2171                                                           from_mode, 0,
2172                                                           &actual_mode);
2173
2174           if (handler == CODE_FOR_nothing)
2175             return false;
2176
2177           from_unsigned1 = from_unsigned2 = false;
2178         }
2179       else
2180         return false;
2181     }
2182
2183   /* Ensure that the inputs to the handler are in the correct precison
2184      for the opcode.  This will be the full mode size.  */
2185   actual_precision = GET_MODE_PRECISION (actual_mode);
2186   if (2 * actual_precision > TYPE_PRECISION (type))
2187     return false;
2188   if (actual_precision != TYPE_PRECISION (type1)
2189       || from_unsigned1 != TYPE_UNSIGNED (type1))
2190     rhs1 = build_and_insert_cast (gsi, loc,
2191                                   build_nonstandard_integer_type
2192                                     (actual_precision, from_unsigned1), rhs1);
2193   if (actual_precision != TYPE_PRECISION (type2)
2194       || from_unsigned2 != TYPE_UNSIGNED (type2))
2195     rhs2 = build_and_insert_cast (gsi, loc,
2196                                   build_nonstandard_integer_type
2197                                     (actual_precision, from_unsigned2), rhs2);
2198
2199   /* Handle constants.  */
2200   if (TREE_CODE (rhs1) == INTEGER_CST)
2201     rhs1 = fold_convert (type1, rhs1);
2202   if (TREE_CODE (rhs2) == INTEGER_CST)
2203     rhs2 = fold_convert (type2, rhs2);
2204
2205   gimple_assign_set_rhs1 (stmt, rhs1);
2206   gimple_assign_set_rhs2 (stmt, rhs2);
2207   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2208   update_stmt (stmt);
2209   widen_mul_stats.widen_mults_inserted++;
2210   return true;
2211 }
2212
2213 /* Process a single gimple statement STMT, which is found at the
2214    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2215    rhs (given by CODE), and try to convert it into a
2216    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2217    is true iff we converted the statement.  */
2218
2219 static bool
2220 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2221                             enum tree_code code)
2222 {
2223   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2224   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2225   tree type, type1, type2, optype;
2226   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2227   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2228   optab this_optab;
2229   enum tree_code wmult_code;
2230   enum insn_code handler;
2231   enum machine_mode to_mode, from_mode, actual_mode;
2232   location_t loc = gimple_location (stmt);
2233   int actual_precision;
2234   bool from_unsigned1, from_unsigned2;
2235
2236   lhs = gimple_assign_lhs (stmt);
2237   type = TREE_TYPE (lhs);
2238   if (TREE_CODE (type) != INTEGER_TYPE
2239       && TREE_CODE (type) != FIXED_POINT_TYPE)
2240     return false;
2241
2242   if (code == MINUS_EXPR)
2243     wmult_code = WIDEN_MULT_MINUS_EXPR;
2244   else
2245     wmult_code = WIDEN_MULT_PLUS_EXPR;
2246
2247   rhs1 = gimple_assign_rhs1 (stmt);
2248   rhs2 = gimple_assign_rhs2 (stmt);
2249
       /* Record the defining statement and rhs code of each SSA operand;
          the code stays ERROR_MARK when the operand is not defined by an
          assignment.  */
2250   if (TREE_CODE (rhs1) == SSA_NAME)
2251     {
2252       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2253       if (is_gimple_assign (rhs1_stmt))
2254         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2255     }
2256
2257   if (TREE_CODE (rhs2) == SSA_NAME)
2258     {
2259       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2260       if (is_gimple_assign (rhs2_stmt))
2261         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2262     }
2263
2264   /* Allow for one conversion statement between the multiply
2265      and addition/subtraction statement.  If there is more than
2266      one conversion then we assume they would invalidate this
2267      transformation.  If that's not the case then they should have
2268      been folded before now.  */
2269   if (CONVERT_EXPR_CODE_P (rhs1_code))
2270     {
2271       conv1_stmt = rhs1_stmt;
2272       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2273       if (TREE_CODE (rhs1) == SSA_NAME)
2274         {
2275           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2276           if (is_gimple_assign (rhs1_stmt))
2277             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2278         }
2279       else
2280         return false;
2281     }
2282   if (CONVERT_EXPR_CODE_P (rhs2_code))
2283     {
2284       conv2_stmt = rhs2_stmt;
2285       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2286       if (TREE_CODE (rhs2) == SSA_NAME)
2287         {
2288           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2289           if (is_gimple_assign (rhs2_stmt))
2290             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2291         }
2292       else
2293         return false;
2294     }
2295
2296   /* If the multiply is already a WIDEN_MULT_EXPR then it would seem
2297      unnecessary to call is_widening_mult_p, but we still need the rhs returns.
2298
2299      It might also appear that it would be sufficient to use the existing
2300      operands of the widening multiply, but that would limit the choice of
2301      multiply-and-accumulate instructions.  */
       /* For MINUS_EXPR only a multiply defining the second operand is
          accepted, since the first test below requires PLUS_EXPR.  */
2302   if (code == PLUS_EXPR
2303       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2304     {
2305       if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2306                                &type2, &mult_rhs2))
2307         return false;
2308       add_rhs = rhs2;
2309       conv_stmt = conv1_stmt;
2310     }
2311   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2312     {
2313       if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2314                                &type2, &mult_rhs2))
2315         return false;
2316       add_rhs = rhs1;
2317       conv_stmt = conv2_stmt;
2318     }
2319   else
2320     return false;
2321
2322   to_mode = TYPE_MODE (type);
2323   from_mode = TYPE_MODE (type1);
2324   from_unsigned1 = TYPE_UNSIGNED (type1);
2325   from_unsigned2 = TYPE_UNSIGNED (type2);
2326   optype = type1;
2327
2328   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2329   if (from_unsigned1 != from_unsigned2)
2330     {
2331       if (!INTEGRAL_TYPE_P (type))
2332         return false;
2333       /* We can use a signed multiply with unsigned types as long as
2334          there is a wider mode to use, or it is the smaller of the two
2335          types that is unsigned.  Note that type1 >= type2, always.  */
2336       if ((from_unsigned1
2337            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2338           || (from_unsigned2
2339               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2340         {
2341           from_mode = GET_MODE_WIDER_MODE (from_mode);
2342           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2343             return false;
2344         }
2345
           /* From here on both operands are treated as signed values of
              FROM_MODE's precision.  */
2346       from_unsigned1 = from_unsigned2 = false;
2347       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2348                                                false);
2349     }
2350
2351   /* If there was a conversion between the multiply and addition
2352      then we need to make sure it fits a multiply-and-accumulate.
2353      There should be a single mode change which does not change the
2354      value.  */
2355   if (conv_stmt)
2356     {
2357       /* We use the original, unmodified data types for this.  */
2358       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2359       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2360       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2361       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2362
2363       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2364         {
2365           /* Conversion is a truncate.  */
2366           if (TYPE_PRECISION (to_type) < data_size)
2367             return false;
2368         }
2369       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2370         {
2371           /* Conversion is an extend.  Check it's the right sort.  */
2372           if (TYPE_UNSIGNED (from_type) != is_unsigned
2373               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2374             return false;
2375         }
2376       /* else convert is a no-op for our purposes.  */
2377     }
2378
2379   /* Verify that the machine can perform a widening multiply
2380      accumulate in this mode/signedness combination, otherwise
2381      this transformation is likely to pessimize code.  */
2382   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2383   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2384                                                   from_mode, 0, &actual_mode);
2385
2386   if (handler == CODE_FOR_nothing)
2387     return false;
2388
2389   /* Ensure that the inputs to the handler are in the correct precision
2390      for the opcode.  This will be the full mode size.  */
2391   actual_precision = GET_MODE_PRECISION (actual_mode);
2392   if (actual_precision != TYPE_PRECISION (type1)
2393       || from_unsigned1 != TYPE_UNSIGNED (type1))
2394     mult_rhs1 = build_and_insert_cast (gsi, loc,
2395                                        build_nonstandard_integer_type
2396                                          (actual_precision, from_unsigned1),
2397                                        mult_rhs1);
2398   if (actual_precision != TYPE_PRECISION (type2)
2399       || from_unsigned2 != TYPE_UNSIGNED (type2))
2400     mult_rhs2 = build_and_insert_cast (gsi, loc,
2401                                        build_nonstandard_integer_type
2402                                          (actual_precision, from_unsigned2),
2403                                        mult_rhs2);
2404
       /* Cast the addend to the result type when its type is not already
          compatible with it.  */
2405   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2406     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2407
2408   /* Handle constants.  */
2409   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2410     mult_rhs1 = fold_convert (type1, mult_rhs1);
2411   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2412     mult_rhs2 = fold_convert (type2, mult_rhs2);
2413
       /* Replace the rhs of the statement at GSI by WMULT_CODE applied to
          the two multiply operands and the addend.  */
2414   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2415                                     add_rhs);
2416   update_stmt (gsi_stmt (*gsi));
2417   widen_mul_stats.maccs_inserted++;
2418   return true;
2419 }
2420
2421 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2422    with uses in additions and subtractions to form fused multiply-add
2423    operations.  Returns true if successful and MUL_STMT should be removed.  */
2424
2425 static bool
2426 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2427 {
2428   tree mul_result = gimple_get_lhs (mul_stmt);
2429   tree type = TREE_TYPE (mul_result);
2430   gimple use_stmt, neguse_stmt, fma_stmt;
2431   use_operand_p use_p;
2432   imm_use_iterator imm_iter;
2433
2434   if (FLOAT_TYPE_P (type)
2435       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2436     return false;
2437
2438   /* We don't want to do bitfield reduction ops.  */
2439   if (INTEGRAL_TYPE_P (type)
2440       && (TYPE_PRECISION (type)
2441           != GET_MODE_PRECISION (TYPE_MODE (type))))
2442     return false;
2443
2444   /* If the target doesn't support it, don't generate it.  We assume that
2445      if fma isn't available then fms, fnma or fnms are not either.  */
2446   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2447     return false;
2448
2449   /* If the multiplication has zero uses, it is kept around probably because
2450      of -fnon-call-exceptions.  Don't optimize it away in that case,
2451      it is DCE job.  */
2452   if (has_zero_uses (mul_result))
2453     return false;
2454
2455   /* Make sure that the multiplication statement becomes dead after
2456      the transformation, thus that all uses are transformed to FMAs.
2457      This means we assume that an FMA operation has the same cost
2458      as an addition.  */
2459   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2460     {
2461       enum tree_code use_code;
2462       tree result = mul_result;
2463       bool negate_p = false;
2464
2465       use_stmt = USE_STMT (use_p);
2466
2467       if (is_gimple_debug (use_stmt))
2468         continue;
2469
2470       /* For now restrict this operations to single basic blocks.  In theory
2471          we would want to support sinking the multiplication in
2472          m = a*b;
2473          if ()
2474            ma = m + c;
2475          else
2476            d = m;
2477          to form a fma in the then block and sink the multiplication to the
2478          else block.  */
2479       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2480         return false;
2481
2482       if (!is_gimple_assign (use_stmt))
2483         return false;
2484
2485       use_code = gimple_assign_rhs_code (use_stmt);
2486
2487       /* A negate on the multiplication leads to FNMA.  */
2488       if (use_code == NEGATE_EXPR)
2489         {
2490           ssa_op_iter iter;
2491           use_operand_p usep;
2492
2493           result = gimple_assign_lhs (use_stmt);
2494
2495           /* Make sure the negate statement becomes dead with this
2496              single transformation.  */
2497           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2498                                &use_p, &neguse_stmt))
2499             return false;
2500
2501           /* Make sure the multiplication isn't also used on that stmt.  */
2502           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2503             if (USE_FROM_PTR (usep) == mul_result)
2504               return false;
2505
2506           /* Re-validate.  */
2507           use_stmt = neguse_stmt;
2508           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2509             return false;
2510           if (!is_gimple_assign (use_stmt))
2511             return false;
2512
2513           use_code = gimple_assign_rhs_code (use_stmt);
2514           negate_p = true;
2515         }
2516
2517       switch (use_code)
2518         {
2519         case MINUS_EXPR:
2520           if (gimple_assign_rhs2 (use_stmt) == result)
2521             negate_p = !negate_p;
2522           break;
2523         case PLUS_EXPR:
2524           break;
2525         default:
2526           /* FMA can only be formed from PLUS and MINUS.  */
2527           return false;
2528         }
2529
2530       /* We can't handle a * b + a * b.  */
2531       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2532         return false;
2533
2534       /* While it is possible to validate whether or not the exact form
2535          that we've recognized is available in the backend, the assumption
2536          is that the transformation is never a loss.  For instance, suppose
2537          the target only has the plain FMA pattern available.  Consider
2538          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2539          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2540          still have 3 operations, but in the FMA form the two NEGs are
2541          independent and could be run in parallel.  */
2542     }
2543
2544   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2545     {
2546       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2547       enum tree_code use_code;
2548       tree addop, mulop1 = op1, result = mul_result;
2549       bool negate_p = false;
2550
2551       if (is_gimple_debug (use_stmt))
2552         continue;
2553
2554       use_code = gimple_assign_rhs_code (use_stmt);
2555       if (use_code == NEGATE_EXPR)
2556         {
2557           result = gimple_assign_lhs (use_stmt);
2558           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2559           gsi_remove (&gsi, true);
2560           release_defs (use_stmt);
2561
2562           use_stmt = neguse_stmt;
2563           gsi = gsi_for_stmt (use_stmt);
2564           use_code = gimple_assign_rhs_code (use_stmt);
2565           negate_p = true;
2566         }
2567
2568       if (gimple_assign_rhs1 (use_stmt) == result)
2569         {
2570           addop = gimple_assign_rhs2 (use_stmt);
2571           /* a * b - c -> a * b + (-c)  */
2572           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2573             addop = force_gimple_operand_gsi (&gsi,
2574                                               build1 (NEGATE_EXPR,
2575                                                       type, addop),
2576                                               true, NULL_TREE, true,
2577                                               GSI_SAME_STMT);
2578         }
2579       else
2580         {
2581           addop = gimple_assign_rhs1 (use_stmt);
2582           /* a - b * c -> (-b) * c + a */
2583           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2584             negate_p = !negate_p;
2585         }
2586
2587       if (negate_p)
2588         mulop1 = force_gimple_operand_gsi (&gsi,
2589                                            build1 (NEGATE_EXPR,
2590                                                    type, mulop1),
2591                                            true, NULL_TREE, true,
2592                                            GSI_SAME_STMT);
2593
2594       fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
2595                                                 gimple_assign_lhs (use_stmt),
2596                                                 mulop1, op2,
2597                                                 addop);
2598       gsi_replace (&gsi, fma_stmt, true);
2599       widen_mul_stats.fmas_inserted++;
2600     }
2601
2602   return true;
2603 }
2604
2605 /* Find integer multiplications where the operands are extended from
2606    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2607    where appropriate.  */
2608
2609 static unsigned int
2610 execute_optimize_widening_mul (void)
2611 {
2612   basic_block bb;
2613   bool cfg_changed = false;
2614
2615   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2616
2617   FOR_EACH_BB (bb)
2618     {
2619       gimple_stmt_iterator gsi;
2620
2621       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2622         {
2623           gimple stmt = gsi_stmt (gsi);
2624           enum tree_code code;
2625
2626           if (is_gimple_assign (stmt))
2627             {
2628               code = gimple_assign_rhs_code (stmt);
2629               switch (code)
2630                 {
2631                 case MULT_EXPR:
2632                   if (!convert_mult_to_widen (stmt, &gsi)
2633                       && convert_mult_to_fma (stmt,
2634                                               gimple_assign_rhs1 (stmt),
2635                                               gimple_assign_rhs2 (stmt)))
2636                     {
2637                       gsi_remove (&gsi, true);
2638                       release_defs (stmt);
2639                       continue;
2640                     }
2641                   break;
2642
2643                 case PLUS_EXPR:
2644                 case MINUS_EXPR:
2645                   convert_plusminus_to_widen (&gsi, stmt, code);
2646                   break;
2647
2648                 default:;
2649                 }
2650             }
2651           else if (is_gimple_call (stmt)
2652                    && gimple_call_lhs (stmt))
2653             {
2654               tree fndecl = gimple_call_fndecl (stmt);
2655               if (fndecl
2656                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2657                 {
2658                   switch (DECL_FUNCTION_CODE (fndecl))
2659                     {
2660                       case BUILT_IN_POWF:
2661                       case BUILT_IN_POW:
2662                       case BUILT_IN_POWL:
2663                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2664                             && REAL_VALUES_EQUAL
2665                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2666                                   dconst2)
2667                             && convert_mult_to_fma (stmt,
2668                                                     gimple_call_arg (stmt, 0),
2669                                                     gimple_call_arg (stmt, 0)))
2670                           {
2671                             unlink_stmt_vdef (stmt);
2672                             if (gsi_remove (&gsi, true)
2673                                 && gimple_purge_dead_eh_edges (bb))
2674                               cfg_changed = true;
2675                             release_defs (stmt);
2676                             continue;
2677                           }
2678                           break;
2679
2680                       default:;
2681                     }
2682                 }
2683             }
2684           gsi_next (&gsi);
2685         }
2686     }
2687
2688   statistics_counter_event (cfun, "widening multiplications inserted",
2689                             widen_mul_stats.widen_mults_inserted);
2690   statistics_counter_event (cfun, "widening maccs inserted",
2691                             widen_mul_stats.maccs_inserted);
2692   statistics_counter_event (cfun, "fused multiply-adds inserted",
2693                             widen_mul_stats.fmas_inserted);
2694
2695   return cfg_changed ? TODO_cleanup_cfg : 0;
2696 }
2697
2698 static bool
2699 gate_optimize_widening_mul (void)
2700 {
2701   return flag_expensive_optimizations && optimize;
2702 }
2703
2704 struct gimple_opt_pass pass_optimize_widening_mul =
2705 {
2706  {
2707   GIMPLE_PASS,
2708   "widening_mul",                       /* name */
2709   gate_optimize_widening_mul,           /* gate */
2710   execute_optimize_widening_mul,        /* execute */
2711   NULL,                                 /* sub */
2712   NULL,                                 /* next */
2713   0,                                    /* static_pass_number */
2714   TV_NONE,                              /* tv_id */
2715   PROP_ssa,                             /* properties_required */
2716   0,                                    /* properties_provided */
2717   0,                                    /* properties_destroyed */
2718   0,                                    /* todo_flags_start */
2719   TODO_verify_ssa
2720   | TODO_verify_stmts
2721   | TODO_update_ssa                     /* todo_flags_finish */
2722  }
2723 };